diff --git a/.gitmodules b/.gitmodules index f8bad9fc4e..cd3d616ab6 100644 --- a/.gitmodules +++ b/.gitmodules @@ -11,3 +11,6 @@ [submodule "CMSIS-NN"] path = TargetLibraries/CMSIS/third_party/CMSIS-NN url = https://github.com/ARM-software/CMSIS-NN.git +[submodule "pulp-trainlib"] + path = TargetLibraries/PULPOpen/third_party/pulp-trainlib + url = https://github.com/runwangdl/pulp-trainlib.git diff --git a/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py b/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py index aba6740d49..d4d7f6df99 100644 --- a/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py +++ b/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py @@ -229,13 +229,14 @@ def _NCHWtoNHWC_fun(graph: gs.Graph, match: Match, name: str, default_channels_f if node.op in ["RequantizedConv", "Conv"]: spatialDims = len(node.inputs[1].shape) - 2 - elif node.op == "MaxPool": + elif node.op in ["MaxPool", "AveragePool", "AveragePoolGrad"]: spatialDims = len(node.attrs["kernel_shape"]) elif node.op == "Pad": spatialDims = 2 # Hack based on current status else: raise ValueError(f"Cannot determine spatialDims for node {node.name} with operator {node.op}") + # Insert Transpose nodes around the op's activation input/output to convert the data layout. permuteIn = _transformLayoutPermutation(len(tensorIn.shape), spatialDims, default_channels_first) graph.nodes.append(_appendTranspose(tensorIn, node, permuteIn)) @@ -245,7 +246,14 @@ def _NCHWtoNHWC_fun(graph: gs.Graph, match: Match, name: str, default_channels_f if node.op in ["Conv", "RequantizedConv"]: # In the case of Conv: [weights, opt. bias], RequantizedConv: [weights, mul, add, opt. 
shift] for tensor in node.inputs[1:]: - _transformLayoutConst(tensor, spatialDims, default_channels_first) + if isinstance(tensor, gs.Constant): + # Inference graph: weight is a fixed constant — permute its data in-place. + _transformLayoutConst(tensor, spatialDims, default_channels_first) + elif isinstance(tensor, gs.Variable) and tensor.shape is not None and len(tensor.shape) >= 2: + # Training graph: weight is a Variable (updated by the optimizer) — cannot permute + # in-place, so insert an explicit Transpose node that will run at inference/forward time. + perm = _transformLayoutPermutation(len(tensor.shape), spatialDims, default_channels_first) + graph.nodes.append(_appendTranspose(tensor, node, perm)) node.attrs["channels_first"] = default_channels_first @@ -261,6 +269,24 @@ def __init__(self, default_channels_first: bool = True): super().__init__(graph, partial(_NCHWtoNHWC_fun, default_channels_first = default_channels_first), name) +@contextagnostic +class NCHWtoNHWCAveragePoolPass(ReplaceSequentialPatternPass): + + def __init__(self, default_channels_first: bool = True): + graph = _singleNodePattern(op = "AveragePool") + name = "_NCHW_TO_NHWC_AVERAGEPOOL_PASS" + super().__init__(graph, partial(_NCHWtoNHWC_fun, default_channels_first = default_channels_first), name) + + +@contextagnostic +class NCHWtoNHWCAveragePoolGradPass(ReplaceSequentialPatternPass): + + def __init__(self, default_channels_first: bool = True): + graph = _singleNodePattern(op = "AveragePoolGrad") + name = "_NCHW_TO_NHWC_AVERAGEPOOLGRAD_PASS" + super().__init__(graph, partial(_NCHWtoNHWC_fun, default_channels_first = default_channels_first), name) + + @contextagnostic class NCHWtoNHWCConvPass(ReplaceSequentialPatternPass): @@ -363,6 +389,8 @@ def __init__(self, default_channels_first: bool = True): passes = [ NCHWtoNHWCPadPass(default_channels_first), NCHWtoNHWCMaxPoolPass(default_channels_first), + NCHWtoNHWCAveragePoolPass(default_channels_first), + 
NCHWtoNHWCAveragePoolGradPass(default_channels_first), NCHWtoNHWCDwConvPass(default_channels_first), NCHWtoNHWCConvPass(default_channels_first), ] @@ -376,6 +404,8 @@ def __init__(self, default_channels_first: bool = True): passes = [ NCHWtoNHWCPadPass(default_channels_first), NCHWtoNHWCMaxPoolPass(default_channels_first), + NCHWtoNHWCAveragePoolPass(default_channels_first), + NCHWtoNHWCAveragePoolGradPass(default_channels_first), PULPNCHWtoNHWCDwConvPass(default_channels_first), NCHWtoNHWCConvPass(default_channels_first), ] @@ -533,8 +563,10 @@ def _remove_only_singleton_reduce_mean(graph: gs.Graph, match: Match, name: str) # Delete node if only reduction over singleton dimensions if 'axis' in node.attrs: axis = node.attrs['axis'] - else: + elif len(node.inputs) > 1 and node.inputs[1] is not None and hasattr(node.inputs[1], 'values') and node.inputs[1].values is not None: axis = node.inputs[1].values + else: + return graph # axis unknown, skip # Check if shape information is available if node.inputs[0].shape is not None and all(node.inputs[0].shape[ax] == 1 for ax in axis): diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index 4dc1819191..771f00c07d 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -336,14 +336,14 @@ def has_live_aliases(self, ctxt: NetworkContext) -> bool: True if this VariableBuffer has any live aliases, False otherwise """ # Do a breadth-first search across the aliasing double-linked list - live = self._live + live = self._live or self.is_input or self.is_output queue = set(self.aliases) visited = set(self.name) while len(queue) > 0: next = queue.pop() buffNext = ctxt.lookup(next) assert isinstance(buffNext, VariableBuffer) - live |= buffNext._live + live |= buffNext._live or buffNext.is_input or buffNext.is_output visited.add(next) queue |= buffNext.aliases - visited return live diff --git a/Deeploy/Targets/GAP9/Bindings.py b/Deeploy/Targets/GAP9/Bindings.py index 2bda98af8f..b95a73d5b8 100644 --- 
a/Deeploy/Targets/GAP9/Bindings.py +++ b/Deeploy/Targets/GAP9/Bindings.py @@ -23,10 +23,15 @@ # Import templates from PULPOpen and Generic from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceMeanTemplate, \ FloatReduceSumTemplate, GatherTemplate, QuantTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate -from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \ - GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \ - QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, RQHardswishChecker, SGDChecker, \ - SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker +from Deeploy.Targets.Generic.TypeCheckers import AddChecker, BatchNormInternalChecker, \ + BatchNormalizationGradChecker, BNGradNormalizeChecker, BNGradReduceChecker, \ + ChannelNormalizeChecker, ConcatChecker, ConvChecker, DequantChecker, \ + WelfordReduceChecker, \ + GatherChecker, GELUChecker, GEMMChecker, GlobalAveragePoolChecker, GlobalAveragePoolGradChecker, \ + HardswishChecker, InPlaceAccumulatorV2Checker, LayerNormChecker, MatMulChecker, MaxPoolGradChecker, MulChecker, \ + MSELossChecker, QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, RQHardswishChecker, \ + SGDChecker, SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker, \ + PULPConvGradBChecker from Deeploy.Targets.PULPOpen.Bindings import ForkClosure, L3MemoryAwareFunctionCallClosure, \ MemoryAwareForkTransformer, MemoryAwareFunctionCallClosure, TilingCallClosure from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass @@ -34,14 +39,17 @@ from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPProfileUntiled import PULPProfileUntiled from 
Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture -from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, DMASliceTemplate, FloatAddTemplate, FloatConvTemplate, \ - FloatGELUTemplate, FloatGemmTemplate, FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, \ - FloatMulTemplate, FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPoolTemplate, \ - MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, RQAddTemplate, RQSiHardswishTemplate, \ +from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, DMASliceTemplate, FloatAddTemplate, \ + FloatAveragePoolTemplate, FloatBatchNormTemplate, FloatConvGradTemplate, FloatConvTemplate, \ + FloatGELUTemplate, FloatGemmTemplate, FloatGlobalAveragePoolTemplate, \ + FloatInPlaceAccumulatorV2Template, FloatLayernormTemplate, FloatMatMulTemplate, \ + FloatMaxPoolTemplate, FloatMulTemplate, FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, \ + MatrixVectorTemplate, MaxPoolTemplate, MSELossTemplate, MulTemplate, ReduceMeanTemplate, \ + RequantShiftTemplate, ReshapeTemplate, RQAddTemplate, RQSiHardswishTemplate, \ SGDTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \ iRMSNormTemplate, iSoftmaxTemplate -from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \ - PULPRequantShiftChecker +from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, \ + PULPMaxPoolChecker, PULPRequantShiftChecker from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \ TilingVariableReplacementUpdate @@ -306,6 +314,14 @@ SoftmaxCrossEntropyLossTemplate.referenceTemplate, GAP9Transformer) for type in IntegerDataTypes ] +# Dual-output binding: outputs[0]=loss (scalar), outputs[1]=log_prob +GAP9SoftmaxCrossEntropyLossDualOutputBindings = [ + NodeBinding( + 
SoftmaxCrossEntropyLossChecker([PointerClass(float32_t), PointerClass(type)], + [PointerClass(float32_t), PointerClass(float32_t)]), + SoftmaxCrossEntropyLossTemplate.referenceDualOutputTemplate, GAP9Transformer) for type in IntegerDataTypes +] + GAP9SoftmaxCrossEntropyLossGradBindings = [ NodeBinding( SoftmaxCrossEntropyLossChecker([PointerClass(float32_t), PointerClass(type)], [PointerClass(float32_t)]), @@ -317,6 +333,179 @@ SGDTemplate.referenceTemplate, GAP9Transformer) ] +# ── Training / Gradient bindings ───────────────────────────────────────── + +GAP9ReluGradBinding = NodeBinding( + ReluChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatReluTemplate.referenceGradTemplate, GAP9Transformer) + +GAP9FloatGELUGradBinding = NodeBinding( + GELUChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatGELUTemplate.referenceGradTemplate, GAP9Transformer) + +GAP9LayernormGradBinding = NodeBinding( + LayerNormChecker( + [PointerClass(float32_t), + PointerClass(float32_t), + PointerClass(float32_t), + PointerClass(float32_t), + PointerClass(float32_t)], + [PointerClass(float32_t), + PointerClass(float32_t), + PointerClass(float32_t)]), FloatLayernormTemplate.referenceGradTemplate, + GAP9Transformer) + +GAP9FloatConvGradW2DBindings = [ + NodeBinding( + ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], + [PointerClass(float32_t)]), FloatConvGradTemplate.referenceConvGradW2DIm2ColTemplate, + GAP9ClusterTransformer) +] + +GAP9FloatConvGradX2DBindings = [ + NodeBinding( + ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], + [PointerClass(float32_t)]), FloatConvGradTemplate.referenceConvGradX2DIm2ColTiledTemplate, + GAP9Transformer) +] + +GAP9FloatDWConvGradX2DBindings = [ + NodeBinding( + ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], + [PointerClass(float32_t)]), FloatConvGradTemplate.referenceDWConvGradX2DTiledTemplate, + GAP9Transformer) +] + 
+GAP9FloatDWConvGradW2DBindings = [ + NodeBinding( + ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], + [PointerClass(float32_t)]), FloatConvGradTemplate.referenceDWConvGradW2DTemplate, + GAP9ClusterTransformer) +] + +GAP9FloatPWConvGradW2DBindings = [ + NodeBinding( + ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], + [PointerClass(float32_t)]), FloatConvGradTemplate.referencePWConvGradW2DTemplate, + GAP9ClusterTransformer) +] + +GAP9FloatPWConvGradX2DBindings = [ + NodeBinding( + ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], + [PointerClass(float32_t)]), FloatConvGradTemplate.referencePWConvGradX2DTemplate, + GAP9ClusterTransformer) +] + +GAP9FloatConvGradBBindings = [ + NodeBinding( + PULPConvGradBChecker([PointerClass(float32_t)], + [PointerClass(float32_t)]), FloatConvGradTemplate.referenceConvGradB2DTemplate, + GAP9ClusterTransformer) +] + +GAP9MaxPoolGrad2DBindings = [ + NodeBinding( + MaxPoolGradChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatMaxPoolTemplate.referenceGradTemplate, GAP9Transformer) +] + +GAP9AveragePool2DBindings = [ + NodeBinding(PULPMaxPoolChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatAveragePoolTemplate.referenceTemplate, GAP9Transformer) +] + +GAP9AveragePoolGrad2DBindings = [ + NodeBinding(PULPMaxPoolChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatAveragePoolTemplate.referenceGradTemplate, GAP9Transformer) +] + +GAP9GlobalAveragePool2DBindings = [ + NodeBinding( + GlobalAveragePoolChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatGlobalAveragePoolTemplate.globalAveragePoolTemplate, + GAP9Transformer) +] + +GAP9GlobalAveragePoolGrad2DBindings = [ + NodeBinding( + GlobalAveragePoolGradChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatGlobalAveragePoolTemplate.globalAveragePoolGradTemplate, + GAP9Transformer) +] + +GAP9MSELossBindings = [ + 
NodeBinding(MSELossChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + MSELossTemplate.referenceTemplate, GAP9Transformer) +] + +GAP9MSELossGradBindings = [ + NodeBinding(MSELossChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + MSELossTemplate.referenceGradientTemplate, GAP9Transformer) +] + +GAP9InPlaceAccumulatorV2Bindings = [ + NodeBinding( + InPlaceAccumulatorV2Checker( + [PointerClass(float32_t), PointerClass(float32_t), PointerClass(uint8_t)], [PointerClass(float32_t)]), + FloatInPlaceAccumulatorV2Template.referenceTemplate, GAP9Transformer) +] + +GAP9InPlaceAccumulatorV2TiledBindings = [ + NodeBinding( + InPlaceAccumulatorV2Checker( + [PointerClass(float32_t), PointerClass(float32_t), PointerClass(uint8_t)], [PointerClass(float32_t)]), + FloatInPlaceAccumulatorV2Template.tiledReferenceTemplate, GAP9Transformer) +] + +GAP9BatchNormInternalBindings = [ + NodeBinding( + BatchNormInternalChecker( + [PointerClass(float32_t)] * 5, + [PointerClass(float32_t)] * 5), FloatBatchNormTemplate.batchNormInternalTemplate, + GAP9Transformer) +] + +GAP9BatchNormalizationGradBindings = [ + NodeBinding( + BatchNormalizationGradChecker( + [PointerClass(float32_t)] * 5, + [PointerClass(float32_t)] * 3), FloatBatchNormTemplate.batchNormGradTemplate, + GAP9Transformer) +] + +GAP9WelfordReduceBindings = [ + NodeBinding( + WelfordReduceChecker( + [PointerClass(float32_t)] * 1, + [PointerClass(float32_t)] * 2), FloatBatchNormTemplate.welfordReduceTemplate, + GAP9Transformer) +] + +GAP9ChannelNormalizeBindings = [ + NodeBinding( + ChannelNormalizeChecker( + [PointerClass(float32_t)] * 5, + [PointerClass(float32_t)] * 1), FloatBatchNormTemplate.channelNormalizeTemplate, + GAP9Transformer) +] + +GAP9BNGradReduceBindings = [ + NodeBinding( + BNGradReduceChecker( + [PointerClass(float32_t)] * 4, + [PointerClass(float32_t)] * 2), FloatBatchNormTemplate.bnGradReduceTemplate, + GAP9Transformer) +] + 
+GAP9BNGradNormalizeBindings = [ + NodeBinding( + BNGradNormalizeChecker( + [PointerClass(float32_t)] * 7, + [PointerClass(float32_t)] * 1), FloatBatchNormTemplate.bnGradNormalizeTemplate, + GAP9Transformer) +] + GAP9TransposeBindings = [ NodeBinding(TransposeChecker([PointerClass(type)], [PointerClass(type)]), TransposeTemplate.referenceTemplate, GAP9Transformer) for type in IntegerDataTypes @@ -328,6 +517,9 @@ GAP9ConcatBindings = [ NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]), ConcatTemplate.referenceTemplate, GAP9ClusterTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ConcatChecker([PointerClass(float_type), PointerClass(float_type)], [PointerClass(float_type)]), + ConcatTemplate.referenceTemplate, GAP9ClusterTransformer) for float_type in FloatDataTypes ] GAP9iRMSNormBindings = [ @@ -370,7 +562,18 @@ GAP9ReluBinding = NodeBinding(ReluChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), FloatReluTemplate.referenceTemplate, GAP9Transformer) +# Forward LayerNorm: 3 inputs (data, weight, bias), 3 outputs (Y, mean_stash, inv_std_stash) +# The 3-output version is needed for training (backward pass needs mean and inv_std stashes). 
GAP9LayernormBinding = NodeBinding( + LayerNormChecker( + [PointerClass(float32_t), PointerClass(float32_t), + PointerClass(float32_t)], + [PointerClass(float32_t), PointerClass(float32_t), + PointerClass(float32_t)]), FloatLayernormTemplate.referenceTemplate, + GAP9Transformer) + +# Inference-only LayerNorm: 3 inputs, 1 output (Y only, no stashes) +GAP9LayernormInferenceBinding = NodeBinding( LayerNormChecker( [PointerClass(float32_t), PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), FloatLayernormTemplate.referenceTemplate, diff --git a/Deeploy/Targets/GAP9/DMA/L3Dma.py b/Deeploy/Targets/GAP9/DMA/L3Dma.py index adbf161328..d3df96702d 100644 --- a/Deeploy/Targets/GAP9/DMA/L3Dma.py +++ b/Deeploy/Targets/GAP9/DMA/L3Dma.py @@ -29,7 +29,7 @@ class GAP9L3Dma(AsyncDma): _transferTemplates = { 2: NodeTemplate( - "pi_cl_ram_copy_2d(get_ram_ptr(), ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});" + "pi_cl_ram_copy_2d(get_ram_ptr(), (uint32_t)${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});" ) } _waitingStrategy = PerTensorWaitingStrategy(GAP9L3DmaFuture) diff --git a/Deeploy/Targets/GAP9/Platform.py b/Deeploy/Targets/GAP9/Platform.py index bad6f8d859..eb0c89bbe8 100644 --- a/Deeploy/Targets/GAP9/Platform.py +++ b/Deeploy/Targets/GAP9/Platform.py @@ -11,40 +11,67 @@ from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper from Deeploy.Targets.GAP9.Templates import AllocateTemplate, FreeTemplate # Import GAP9-specific tiler bindings -from Deeploy.Targets.GAP9.Tiler import GAP9AddTilingReadyBindings, GAP9ConcatTilingReadyBindings, \ - GAP9Conv2DTilingReadyBindings, GAP9DWConv2DTilingReadyBindings, GAP9FlattenTilingReadyBindings, \ - GAP9FPGELUTilingReadyBindings, GAP9FPGEMMTilingReadyBindings, GAP9GatherTilingReadyBindings, \ - GAP9iHardswishTilingReadyBindings, GAP9iRMSNormTilingReadyBindings, 
GAP9iRQSGELUTilingReadyBindings, \ - GAP9LayernormTilingReadyBindings, GAP9MatMulTilingReadyBindings, GAP9MaxPool2DTilingReadyBindings, \ - GAP9MulTilingReadyBindings, GAP9ReduceSumTilingReadyBindings, GAP9ReluTilingReadyBindings, \ +from Deeploy.Targets.GAP9.Tiler import GAP9AddTilingReadyBindings, GAP9AveragePool2DTilingReadyBindings, \ + GAP9AveragePoolGrad2DTilingReadyBindings, GAP9BatchNormInternalTilingReadyBindings, \ + GAP9BatchNormalizationGradTilingReadyBindings, GAP9ConcatTilingReadyBindings, \ + GAP9Conv2DTilingReadyBindings, GAP9ConvGradBTilingReadyBindings, GAP9ConvGradW2DTilingReadyBindings, \ + GAP9ConvGradX2DTilingReadyBindings, GAP9DWConv2DTilingReadyBindings, GAP9DWConvGradW2DTilingReadyBindings, \ + GAP9DWConvGradX2DTilingReadyBindings, GAP9FlattenTilingReadyBindings, \ + GAP9FPGELUGradTilingReadyBindings, GAP9FPGELUTilingReadyBindings, GAP9FPGEMMTilingReadyBindings, \ + GAP9GatherTilingReadyBindings, GAP9GlobalAveragePool2DTilingReadyBindings, \ + GAP9GlobalAveragePoolGrad2DTilingReadyBindings, \ + GAP9iHardswishTilingReadyBindings, GAP9InPlaceAccumulatorV2TilingReadyBindings, \ + GAP9iRMSNormTilingReadyBindings, GAP9iRQSGELUTilingReadyBindings, \ + GAP9LayernormGradTilingReadyBindings, GAP9LayernormTilingReadyBindings, \ + GAP9MatMulTilingReadyBindings, GAP9MaxPool2DTilingReadyBindings, GAP9MaxPoolGrad2DTilingReadyBindings, \ + GAP9MSELossGradTilingReadyBindings, GAP9MSELossTilingReadyBindings, \ + GAP9MulTilingReadyBindings, GAP9PWConvGradW2DTilingReadyBindings, GAP9PWConvGradX2DTilingReadyBindings, \ + GAP9ReduceSumTilingReadyBindings, GAP9ReluGradTilingReadyBindings, GAP9ReluTilingReadyBindings, \ GAP9RQAddTilingReadyBindings, GAP9RQSConv2DTilingReadyBindings, GAP9RQSDWConv2DTilingReadyBindings, \ GAP9RQSGEMMTilingReadyBindings, GAP9RQSiHardswishTilingReadyBindings, GAP9RQSMatrixVecTilingReadyBindings, \ GAP9RQSTallGEMMTilingReadyBindings, GAP9RQSTilingReadyBindings, GAP9SGDTilingReadyBindings, \ - 
GAP9SoftmaxCrossEntropyGradTilingReadyBindings, GAP9SoftmaxCrossEntropyTilingReadyBindings, \ + GAP9SoftmaxCrossEntropyGradTilingReadyBindings, GAP9SoftmaxCrossEntropyLossDualOutputTilingReadyBindings, \ + GAP9SliceTilingReadyBindings, GAP9SoftmaxCrossEntropyTilingReadyBindings, \ GAP9SoftmaxGradTilingReadyBindings, GAP9SoftmaxTilingReadyBindings, GAP9TransposeTilingReadyBindings, \ - GAP9UniformRQSTilingReadyBindings + GAP9UniformRQSTilingReadyBindings, \ + GAP9WelfordReduceTilingReadyBindings, GAP9ChannelNormalizeTilingReadyBindings, \ + GAP9BNGradReduceTilingReadyBindings, GAP9BNGradNormalizeTilingReadyBindings from Deeploy.Targets.Generic.Bindings import BasicGEMMBindings, BasicPad1DBindings, BasicPad2DBindings, \ BasicRQIntegerDivBinding -from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, ConvLayer, GatherLayer, GELULayer, GEMMLayer, \ - LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, ReduceMeanLayer, ReduceSumLayer, \ - ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, RQSiHardswishLayer, SGDLayer, \ - SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, SoftmaxGradLayer, SoftmaxLayer, \ - TransposeLayer, iHardswishLayer, iRMSNormLayer -from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, DequantParser, FlattenParser, GatherParser, \ - GELUParser, GEMMParser, LayerNormParser, MatMulParser, MaxPool2DParser, MulParser, Pad1DParser, Pad2DParser, \ - QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, RequantShiftParser, ReshapeParser, RQAddParser, \ - RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, SGDParser, SliceParser, \ +from Deeploy.Targets.Generic.Layers import AddLayer, AveragePoolGradLayer, AveragePoolLayer, \ + BatchNormInternalLayer, BatchNormalizationGradLayer, BNGradNormalizeLayer, BNGradReduceLayer, \ + ChannelNormalizeLayer, ConcatLayer, ConvLayer, \ + WelfordReduceLayer, \ + ConvGradBLayer, ConvGradWLayer, ConvGradXLayer, 
GatherLayer, GELUGradLayer, GELULayer, GEMMLayer, \ + GlobalAveragePoolLayer, GlobalAveragePoolGradLayer, \ + InPlaceAccumulatorV2Layer, \ + LayerNormGradLayer, LayerNormLayer, MatMulLayer, MaxPoolGradLayer, MaxPoolLayer, \ + MSELossGradLayer, MSELossLayer, MulLayer, PadLayer, QuantLayer, ReduceMeanLayer, ReduceSumLayer, \ + ReluGradLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, \ + RQSiHardswishLayer, SGDLayer, SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, \ + SoftmaxGradLayer, SoftmaxLayer, TransposeLayer, iHardswishLayer, iRMSNormLayer +from Deeploy.Targets.Generic.Parsers import AddParser, AveragePool2DParser, BatchNormInternalParser, \ + BatchNormalizationGradParser, BNGradNormalizeParser, BNGradReduceParser, \ + ChannelNormalizeParser, ConcatParser, Conv2DGradBParser, DequantParser, FlattenParser, GatherParser, \ + WelfordReduceParser, \ + GELUGradParser, GELUParser, GEMMParser, GlobalAveragePoolParser, GlobalAveragePoolGradParser, \ + InPlaceAccumulatorV2Parser, \ + LayerNormGradParser, LayerNormParser, MatMulParser, MaxPool2DParser, MaxPoolGradParser, \ + MSELossGradParser, MSELossParser, MulParser, Pad1DParser, Pad2DParser, \ + QuantParser, ReduceMeanParser, ReduceSumParser, ReluGradParser, ReluParser, RequantShiftParser, ReshapeParser, \ + RQAddParser, RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, SGDParser, SliceParser, \ SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, SoftmaxGradParser, SoftmaxParser, \ TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate from Deeploy.Targets.PULPOpen.Bindings import BasicDequantBindings, BasicQuantBindings, PULPDMASliceBindings, \ PULPDWConv1DBinding, PULPReduceMeanBindings, PULPRQSConv1DBindings, PULPSliceBindings from Deeploy.Targets.PULPOpen.Layers import PULPRQSConvLayer, 
PULPRQSGEMMLayer -from Deeploy.Targets.PULPOpen.Parsers import PULPConv1DParser, PULPConv2DParser, PULPDWConv1DParser, \ - PULPDWConv2DParser, PULPFPConv2DParser, PULPFPDWConv2DParser, PULPGEMMParser, PULPMatrixVecParser, \ - PULPTallGEMMParser +from Deeploy.Targets.PULPOpen.Parsers import PULPConv1DParser, PULPConv2DParser, PULPConvGradW2DParser, \ + PULPConvGradX2DParser, PULPDWConv1DParser, PULPDWConv2DParser, PULPDWConvGradW2DParser, \ + PULPDWConvGradX2DParser, PULPFPConv2DParser, PULPFPDWConv2DParser, PULPGEMMParser, PULPMatrixVecParser, \ + PULPPWConvGradW2DParser, PULPPWConvGradX2DParser, PULPTallGEMMParser -# Create GAP9-specific NodeMappers +# ── Inference NodeMappers ────────────────────────────────────────────────── GAP9_RQAddMapper = NodeMapper(RQAddParser(), GAP9RQAddTilingReadyBindings) GAP9_AddMapper = NodeMapper(AddParser(), GAP9AddTilingReadyBindings) GAP9_FlattenMapper = NodeMapper(FlattenParser(), GAP9FlattenTilingReadyBindings) @@ -77,25 +104,55 @@ GAP9_LayerNormMapper = NodeMapper(LayerNormParser(), GAP9LayernormTilingReadyBindings) GAP9_ReluMapper = NodeMapper(ReluParser(), GAP9ReluTilingReadyBindings) GAP9_SoftmaxMapper = NodeMapper(SoftmaxParser(), GAP9SoftmaxTilingReadyBindings) -GAP9_SoftmaxGradMapper = NodeMapper(SoftmaxGradParser(), GAP9SoftmaxGradTilingReadyBindings) GAP9_Softmax_int8_Mapper = NodeMapper(iSoftmaxParser(), GAP9SoftmaxTilingReadyBindings) GAP9_ConcatMapper = NodeMapper(ConcatParser(), GAP9ConcatTilingReadyBindings) GAP9_DMASliceMapper = NodeMapper(SliceParser(), PULPDMASliceBindings) -GAP9_SliceMapper = NodeMapper(SliceParser(), PULPSliceBindings) +GAP9_SliceMapper = NodeMapper(SliceParser(), GAP9SliceTilingReadyBindings) GAP9_iRMSNormMapper = NodeMapper(iRMSNormParser(), GAP9iRMSNormTilingReadyBindings) GAP9_iHardswishMapper = NodeMapper(iHardswishParser(), GAP9iHardswishTilingReadyBindings) GAP9_RQSiHardswishMapper = NodeMapper(RQSiHardswishParser(), GAP9RQSiHardswishTilingReadyBindings) +GAP9_QuantMapper = 
NodeMapper(QuantParser(), BasicQuantBindings) +GAP9_DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) +GAP9_GEMMDequantMapper = NodeMapper(PULPGEMMParser(), BasicGEMMBindings) + +# ── Training / Gradient NodeMappers ─────────────────────────────────────── +GAP9_GELUGradMapper = NodeMapper(GELUGradParser(), GAP9FPGELUGradTilingReadyBindings) +GAP9_ConvGradXMapper = NodeMapper(PULPConvGradX2DParser(), GAP9ConvGradX2DTilingReadyBindings) +GAP9_DWConvGradXMapper = NodeMapper(PULPDWConvGradX2DParser(), GAP9DWConvGradX2DTilingReadyBindings) +GAP9_PWConvGradX2DMapper = NodeMapper(PULPPWConvGradX2DParser(), GAP9PWConvGradX2DTilingReadyBindings) +GAP9_ConvGradWMapper = NodeMapper(PULPConvGradW2DParser(), GAP9ConvGradW2DTilingReadyBindings) +GAP9_DWConvGradWMapper = NodeMapper(PULPDWConvGradW2DParser(), GAP9DWConvGradW2DTilingReadyBindings) +GAP9_PWConvGradW2DMapper = NodeMapper(PULPPWConvGradW2DParser(), GAP9PWConvGradW2DTilingReadyBindings) +GAP9_ConvGradBMapper = NodeMapper(Conv2DGradBParser(), GAP9ConvGradBTilingReadyBindings) +GAP9_LayerNormGradMapper = NodeMapper(LayerNormGradParser(), GAP9LayernormGradTilingReadyBindings) +GAP9_AveragePool2DMapper = NodeMapper(AveragePool2DParser(), GAP9AveragePool2DTilingReadyBindings) +GAP9_AveragePoolGrad2DMapper = NodeMapper(AveragePool2DParser(), GAP9AveragePoolGrad2DTilingReadyBindings) +GAP9_MaxPoolGrad2DMapper = NodeMapper(MaxPoolGradParser(), GAP9MaxPoolGrad2DTilingReadyBindings) +GAP9_ReluGradMapper = NodeMapper(ReluGradParser(), GAP9ReluGradTilingReadyBindings) +GAP9_SoftmaxGradMapper = NodeMapper(SoftmaxGradParser(), GAP9SoftmaxGradTilingReadyBindings) GAP9_SoftmaxCrossEntropyLossMapper = NodeMapper(SoftmaxCrossEntropyLossParser(), GAP9SoftmaxCrossEntropyTilingReadyBindings) +# Dual-output mapper (loss + log_prob): tried first; falls back to single-output mapper for 1-output nodes +GAP9_SoftmaxCrossEntropyLossDualOutputMapper = NodeMapper(SoftmaxCrossEntropyLossParser(), + 
GAP9SoftmaxCrossEntropyLossDualOutputTilingReadyBindings) GAP9_SoftmaxCrossEntropyLossGradMapper = NodeMapper(SoftmaxCrossEntropyLossGradParser(), GAP9SoftmaxCrossEntropyGradTilingReadyBindings) GAP9_SGDMapper = NodeMapper(SGDParser(), GAP9SGDTilingReadyBindings) -GAP9_QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings) -GAP9_DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) -GAP9_GEMMDequantMapper = NodeMapper(PULPGEMMParser(), BasicGEMMBindings) +GAP9_MSELossMapper = NodeMapper(MSELossParser(), GAP9MSELossTilingReadyBindings) +GAP9_MSELossGradMapper = NodeMapper(MSELossGradParser(), GAP9MSELossGradTilingReadyBindings) +GAP9_InPlaceAccumulatorV2Mapper = NodeMapper(InPlaceAccumulatorV2Parser(), GAP9InPlaceAccumulatorV2TilingReadyBindings) +GAP9_BatchNormInternalMapper = NodeMapper(BatchNormInternalParser(), GAP9BatchNormInternalTilingReadyBindings) +GAP9_BatchNormalizationGradMapper = NodeMapper(BatchNormalizationGradParser(), GAP9BatchNormalizationGradTilingReadyBindings) +GAP9_WelfordReduceMapper = NodeMapper(WelfordReduceParser(), GAP9WelfordReduceTilingReadyBindings) +GAP9_ChannelNormalizeMapper = NodeMapper(ChannelNormalizeParser(), GAP9ChannelNormalizeTilingReadyBindings) +GAP9_BNGradReduceMapper = NodeMapper(BNGradReduceParser(), GAP9BNGradReduceTilingReadyBindings) +GAP9_BNGradNormalizeMapper = NodeMapper(BNGradNormalizeParser(), GAP9BNGradNormalizeTilingReadyBindings) +GAP9_GlobalAveragePoolMapper = NodeMapper(GlobalAveragePoolParser(), GAP9GlobalAveragePool2DTilingReadyBindings) +GAP9_GlobalAveragePoolGradMapper = NodeMapper(GlobalAveragePoolGradParser(), GAP9GlobalAveragePoolGrad2DTilingReadyBindings) # GAP9-specific mapping using ClDma GAP9Mapping = { + # ── Inference operators ─────────────────────────────────────────────── 'Conv': ConvLayer([GAP9_FPConv2DMapper, GAP9_FPDWConv2DMapper]), 'RequantizedConv': @@ -164,14 +221,56 @@ QuantLayer([GAP9_QuantMapper]), 'Dequant': QuantLayer([GAP9_DequantMapper]), + 'AveragePool': + 
AveragePoolLayer([GAP9_AveragePool2DMapper]), + # ── Training / Gradient operators ───────────────────────────────────── + 'ConvGradX': + ConvGradXLayer([GAP9_PWConvGradX2DMapper, GAP9_DWConvGradXMapper, GAP9_ConvGradXMapper]), + 'ConvGradW': + ConvGradWLayer([GAP9_PWConvGradW2DMapper, GAP9_DWConvGradWMapper, GAP9_ConvGradWMapper]), + 'ConvGradB': + ConvGradBLayer([GAP9_ConvGradBMapper]), + 'GeluGrad': + GELUGradLayer([GAP9_GELUGradMapper]), + 'LayerNormalizationGrad': + LayerNormGradLayer([GAP9_LayerNormGradMapper]), + 'AveragePoolGrad': + AveragePoolGradLayer([GAP9_AveragePoolGrad2DMapper]), + 'MaxPoolGrad': + MaxPoolGradLayer([GAP9_MaxPoolGrad2DMapper]), + 'ReluGrad': + ReluGradLayer([GAP9_ReluGradMapper]), 'SoftmaxGrad': SoftmaxGradLayer([GAP9_SoftmaxGradMapper]), + 'MSELoss': + MSELossLayer([GAP9_MSELossMapper]), + 'MSELossGrad': + MSELossGradLayer([GAP9_MSELossGradMapper]), 'SoftmaxCrossEntropyLoss': - SoftmaxCrossEntropyLossLayer([GAP9_SoftmaxCrossEntropyLossMapper]), + SoftmaxCrossEntropyLossLayer([GAP9_SoftmaxCrossEntropyLossDualOutputMapper, GAP9_SoftmaxCrossEntropyLossMapper]), 'SoftmaxCrossEntropyLossGrad': SoftmaxCrossEntropyLossGradLayer([GAP9_SoftmaxCrossEntropyLossGradMapper]), 'SGD': - SGDLayer([GAP9_SGDMapper]) + SGDLayer([GAP9_SGDMapper]), + 'InPlaceAccumulatorV2': + InPlaceAccumulatorV2Layer([GAP9_InPlaceAccumulatorV2Mapper]), + 'BatchNormInternal': + BatchNormInternalLayer([GAP9_BatchNormInternalMapper]), + 'BatchNormalizationGrad': + BatchNormalizationGradLayer([GAP9_BatchNormalizationGradMapper]), + 'GlobalAveragePool': + GlobalAveragePoolLayer([GAP9_GlobalAveragePoolMapper]), + 'GlobalAveragePoolGrad': + GlobalAveragePoolGradLayer([GAP9_GlobalAveragePoolGradMapper]), + # Split BN ops (for spatial tiling) + 'WelfordReduce': + WelfordReduceLayer([GAP9_WelfordReduceMapper]), + 'ChannelNormalize': + ChannelNormalizeLayer([GAP9_ChannelNormalizeMapper]), + 'BNGradReduce': + BNGradReduceLayer([GAP9_BNGradReduceMapper]), + 'BNGradNormalize': + 
BNGradNormalizeLayer([GAP9_BNGradNormalizeMapper]), } diff --git a/Deeploy/Targets/GAP9/Tiler.py b/Deeploy/Targets/GAP9/Tiler.py index fefe12b6d7..1a07dfe0bf 100644 --- a/Deeploy/Targets/GAP9/Tiler.py +++ b/Deeploy/Targets/GAP9/Tiler.py @@ -10,13 +10,24 @@ import copy -from Deeploy.Targets.GAP9.Bindings import GAP9AddBindings, GAP9ConcatBindings, GAP9FloatConv2DBindings, \ - GAP9FloatDWConv2DBindings, GAP9FloatGELUBinding, GAP9FloatGEMMBindings, GAP9GatherBindings, \ - GAP9iHardswishBindings, GAP9iRMSNormBindings, GAP9iRQSGELUBindings, GAP9LayernormBinding, GAP9MatMulBindings, \ - GAP9MaxPool2DBindings, GAP9MulBindings, GAP9ReduceSumBindings, GAP9ReluBinding, GAP9ReshapeBindings, \ +from Deeploy.Targets.GAP9.Bindings import GAP9AddBindings, GAP9AveragePool2DBindings, \ + GAP9AveragePoolGrad2DBindings, GAP9BatchNormInternalBindings, GAP9BatchNormalizationGradBindings, \ + GAP9BNGradNormalizeBindings, GAP9BNGradReduceBindings, GAP9ChannelNormalizeBindings, \ + GAP9WelfordReduceBindings, \ + GAP9ConcatBindings, GAP9FloatConv2DBindings, GAP9FloatConvGradBBindings, GAP9FloatConvGradW2DBindings, \ + GAP9FloatConvGradX2DBindings, GAP9FloatDWConv2DBindings, GAP9FloatDWConvGradW2DBindings, \ + GAP9FloatDWConvGradX2DBindings, GAP9FloatGELUBinding, GAP9FloatGELUGradBinding, GAP9FloatGEMMBindings, \ + GAP9FloatPWConvGradW2DBindings, GAP9FloatPWConvGradX2DBindings, GAP9GatherBindings, \ + GAP9GlobalAveragePool2DBindings, GAP9GlobalAveragePoolGrad2DBindings, \ + GAP9iHardswishBindings, GAP9InPlaceAccumulatorV2Bindings, GAP9InPlaceAccumulatorV2TiledBindings, GAP9iRMSNormBindings, GAP9iRQSGELUBindings, \ + GAP9LayernormBinding, GAP9LayernormGradBinding, GAP9LayernormInferenceBinding, \ + GAP9MatMulBindings, GAP9MaxPool2DBindings, \ + GAP9MaxPoolGrad2DBindings, GAP9MSELossBindings, GAP9MSELossGradBindings, \ + GAP9MulBindings, GAP9ReduceSumBindings, GAP9ReluBinding, GAP9ReluGradBinding, GAP9ReshapeBindings, \ GAP9RQAddBindings, GAP9RQSBindings, GAP9RQSConv2DBindings, 
GAP9RQSDWConv2DBindings, GAP9RQSGEMMBindings, \ GAP9RQSiHardswishBindings, GAP9RQSMatrixVecBindings, GAP9RQSTallGEMMBindings, GAP9SGDBindings, \ - GAP9SoftmaxBindings, GAP9SoftmaxCrossEntropyLossBindings, GAP9SoftmaxCrossEntropyLossGradBindings, \ + GAP9SoftmaxBindings, GAP9SoftmaxCrossEntropyLossBindings, GAP9SoftmaxCrossEntropyLossDualOutputBindings, \ + GAP9SliceBindings, GAP9SoftmaxCrossEntropyLossGradBindings, \ GAP9SoftmaxGradBindings, GAP9TransposeBindings, GAP9UniformRQSBindings from Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint from Deeploy.Targets.Generic.TileConstraints.ConcatTileConstraint import ConcatTileConstraint @@ -29,19 +40,39 @@ from Deeploy.Targets.Generic.TileConstraints.TransposeTileConstraint import TransposeTileConstraint from Deeploy.Targets.Generic.TileConstraints.UnaryTileConstraint import UnaryTileConstraint from Deeploy.Targets.Generic.TileConstraints.UntiledTileConstraint import UntiledTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.InPlaceAccumulatorV2TileConstraint import InPlaceAccumulatorV2TileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv2DTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint, \ RQDWConv2DTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.AveragePoolTileConstraint import AveragePoolCTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.BatchNormTileConstraint import BatchNormInternalTileConstraint, \ + BatchNormalizationGradTileConstraint, WelfordReduceTileConstraint, ChannelNormalizeTileConstraint, \ + BNGradReduceTileConstraint, BNGradNormalizeTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.ConvGradConstraint import ConvGradBTileConstraint, \ + ConvGradX2DIm2ColHWTileConstraint, ConvGradW2DTileConstraint, \ + DWConvGradX2DTileConstraint, DWConvGradW2DTileConstraint, PWConvGradXTileConstraint, 
PWConvGradWTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.GatherTileConstraint import GatherTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.GEMMTileConstraint import FloatGEMMTileConstraint, GEMMTileConstraint -from Deeploy.Targets.PULPOpen.TileConstraints.iSoftmaxTileConstraint import iSoftmaxTileConstraint -from Deeploy.Targets.PULPOpen.TileConstraints.LayernormTileConstraint import LayernormTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.GeluTileConstraint import GeluGradTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.GlobalAveragePoolTileConstraint import GlobalAveragePoolTileConstraint, \ + GlobalAveragePoolGradTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.iSoftmaxTileConstraint import SoftmaxGradTileConstraint, \ + iSoftmaxTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.ReduceSumTileConstraint import ReduceSumTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.LayernormTileConstraint import LayernormGradTileConstraint, \ + LayernormTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.MatMulTileConstraint import MatMulTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.MaxPoolTileConstraint import MaxPoolCTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.MaxPoolGradTileConstraint import MaxPoolGradCTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.MSELossTileConstraint import MSELossTileConstraint, \ + MSELossGradTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.RequantShiftTileConstraint import RequantShiftTileConstraint -from Deeploy.Targets.PULPOpen.TileConstraints.SGDTileConstraint import SGDTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.SGDTileConstraint import ReluGradTileConstraint, SGDTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.SliceConstraint import SliceTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.SoftmaxCrossEntropyTileConstraint import \ 
SoftmaxCrossEntropyGradTileConstraint, SoftmaxCrossEntropyTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.SoftmaxCrossEntropyLossDualOutputTileConstraint import \ + SoftmaxCrossEntropyLossDualOutputTileConstraint from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings # GAP9-specific tiling ready bindings using ClDma @@ -119,8 +150,10 @@ GAP9ReluTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = [GAP9ReluBinding], tileConstraint = UnaryTileConstraint()) -GAP9LayernormTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = [GAP9LayernormBinding], - tileConstraint = LayernormTileConstraint()) +# Training LayerNorm (3 outputs: Y, mean, inv_std) tried first, then inference (1 output) +GAP9LayernormTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = [GAP9LayernormBinding, GAP9LayernormInferenceBinding], + tileConstraint = LayernormTileConstraint()) GAP9FPGELUTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = [GAP9FloatGELUBinding], tileConstraint = UnaryTileConstraint()) @@ -131,14 +164,94 @@ GAP9SoftmaxCrossEntropyTilingReadyBindings = TilingReadyNodeBindings( nodeBindings = GAP9SoftmaxCrossEntropyLossBindings, tileConstraint = SoftmaxCrossEntropyTileConstraint()) +GAP9SoftmaxCrossEntropyLossDualOutputTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = GAP9SoftmaxCrossEntropyLossDualOutputBindings, tileConstraint = SoftmaxCrossEntropyLossDualOutputTileConstraint()) + GAP9SoftmaxCrossEntropyGradTilingReadyBindings = TilingReadyNodeBindings( nodeBindings = GAP9SoftmaxCrossEntropyLossGradBindings, tileConstraint = SoftmaxCrossEntropyGradTileConstraint()) GAP9SoftmaxGradTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9SoftmaxGradBindings, - tileConstraint = UntiledTileConstraint()) + tileConstraint = SoftmaxGradTileConstraint()) GAP9ReduceSumTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9ReduceSumBindings, - tileConstraint = UntiledTileConstraint()) + 
tileConstraint = ReduceSumTileConstraint()) GAP9SGDTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9SGDBindings, tileConstraint = SGDTileConstraint()) + +# ── Training / Gradient tiling ready bindings ──────────────────────────── + +GAP9ReluGradTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = [GAP9ReluGradBinding], + tileConstraint = ReluGradTileConstraint()) + +GAP9FPGELUGradTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = [GAP9FloatGELUGradBinding], + tileConstraint = GeluGradTileConstraint()) + +GAP9LayernormGradTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = [GAP9LayernormGradBinding], + tileConstraint = LayernormGradTileConstraint()) + +GAP9ConvGradX2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9FloatConvGradX2DBindings, + tileConstraint = ConvGradX2DIm2ColHWTileConstraint()) + +GAP9ConvGradW2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9FloatConvGradW2DBindings, + tileConstraint = ConvGradW2DTileConstraint()) + +GAP9DWConvGradX2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9FloatDWConvGradX2DBindings, + tileConstraint = DWConvGradX2DTileConstraint()) + +GAP9DWConvGradW2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9FloatDWConvGradW2DBindings, + tileConstraint = DWConvGradW2DTileConstraint()) + +GAP9PWConvGradW2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9FloatPWConvGradW2DBindings, + tileConstraint = PWConvGradWTileConstraint()) + +GAP9PWConvGradX2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9FloatPWConvGradX2DBindings, + tileConstraint = PWConvGradXTileConstraint()) + +GAP9ConvGradBTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = GAP9FloatConvGradBBindings, tileConstraint = ConvGradBTileConstraint()) + +GAP9MaxPoolGrad2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9MaxPoolGrad2DBindings, + tileConstraint = MaxPoolGradCTileConstraint()) + 
+GAP9AveragePool2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9AveragePool2DBindings, + tileConstraint = AveragePoolCTileConstraint()) + +GAP9AveragePoolGrad2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = GAP9AveragePoolGrad2DBindings, + tileConstraint = AveragePoolCTileConstraint()) + +GAP9GlobalAveragePool2DTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = GAP9GlobalAveragePool2DBindings, tileConstraint = GlobalAveragePoolTileConstraint()) + +GAP9GlobalAveragePoolGrad2DTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = GAP9GlobalAveragePoolGrad2DBindings, tileConstraint = GlobalAveragePoolGradTileConstraint()) + +GAP9MSELossTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = GAP9MSELossBindings, tileConstraint = MSELossTileConstraint()) + +GAP9MSELossGradTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = GAP9MSELossGradBindings, tileConstraint = MSELossGradTileConstraint()) + +GAP9InPlaceAccumulatorV2TilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = GAP9InPlaceAccumulatorV2TiledBindings, tileConstraint = InPlaceAccumulatorV2TileConstraint()) + +GAP9BatchNormInternalTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = GAP9BatchNormInternalBindings, tileConstraint = BatchNormInternalTileConstraint()) + +GAP9BatchNormalizationGradTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = GAP9BatchNormalizationGradBindings, tileConstraint = BatchNormalizationGradTileConstraint()) + +GAP9WelfordReduceTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = GAP9WelfordReduceBindings, tileConstraint = WelfordReduceTileConstraint()) + +GAP9ChannelNormalizeTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = GAP9ChannelNormalizeBindings, tileConstraint = ChannelNormalizeTileConstraint()) + +GAP9BNGradReduceTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = GAP9BNGradReduceBindings, tileConstraint = 
BNGradReduceTileConstraint()) + +GAP9BNGradNormalizeTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = GAP9BNGradNormalizeBindings, tileConstraint = BNGradNormalizeTileConstraint()) + +GAP9SliceTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = GAP9SliceBindings, tileConstraint = SliceTileConstraint()) diff --git a/Deeploy/Targets/Generic/Bindings.py b/Deeploy/Targets/Generic/Bindings.py index 308b179aef..8e3539962e 100644 --- a/Deeploy/Targets/Generic/Bindings.py +++ b/Deeploy/Targets/Generic/Bindings.py @@ -14,17 +14,20 @@ from Deeploy.Targets.Generic.Templates import AddTemplate, BatchNormalizationTemplate, ConcatTemplate, ConvTemplate, \ ConvTransposeTemplate, DebugPrintTemplate, DequantTemplate, DummyTemplate, DWConvTemplate, FloatAddTemplate, \ FloatConvTemplate, FloatDivTemplate, FloatDWConvTemplate, FloatGELUTemplate, FloatGemmTemplate, \ - FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, FloatMulTemplate, FloatPadTemplate, \ - FloatPowTemplate, FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, FloatSqrtTemplate, \ - GatherTemplate, GemmTemplate, IntegerDivTemplate, ITAMaxTemplate, ITAPartialMaxTemplate, MatMulTemplate, \ - MaxPoolTemplate, MulTemplate, PadTemplate, QuantTemplate, ReduceMeanTemplate, ReduceSumTemplate, \ - RequantShiftTemplate, ReshapeTemplate, RQIntegerDivTemplate, RQSiGELUTemplate, SliceTemplate, TransposeTemplate, \ - iGELUTemplate, iLayernormTemplate, iRMSNormTemplate, iSoftmaxTemplate + FloatLayernormTemplate, FloatMatMulTemplate, \ + FloatMaxPoolTemplate, \ + FloatMulTemplate, FloatPadTemplate, FloatPowTemplate, FloatReduceMeanTemplate, FloatReluTemplate, \ + FloatSoftmaxTemplate, FloatSqrtTemplate, GatherTemplate, GemmTemplate, IntegerDivTemplate, ITAMaxTemplate, \ + ITAPartialMaxTemplate, MatMulTemplate, MaxPoolTemplate, MulTemplate, PadTemplate, QuantTemplate, \ + ReduceMeanTemplate, ReduceSumTemplate, RequantShiftTemplate, ReshapeTemplate, RQIntegerDivTemplate, \ + 
RQSiGELUTemplate, SliceTemplate, TransposeTemplate, iGELUTemplate, iLayernormTemplate, iRMSNormTemplate, \ + iSoftmaxTemplate from Deeploy.Targets.Generic.TypeCheckers import AddChecker, BatchNormChecker, ConcatChecker, ConvChecker, \ DebugPrintChecker, DequantChecker, DivChecker, DummyChecker, GatherChecker, GELUChecker, GEMMChecker, \ - LayerNormChecker, MatMulChecker, MaxPoolChecker, MulChecker, PadChecker, QuantChecker, ReduceMeanChecker, \ - ReduceSumChecker, ReluChecker, RequantShiftChecker, ReshapeChecker, RQIntegerDivChecker, SliceChecker, \ - SoftmaxChecker, TransposeChecker + LayerNormChecker, MatMulChecker, MaxPoolChecker, MulChecker, \ + PadChecker, \ + QuantChecker, ReduceMeanChecker, ReduceSumChecker, ReluChecker, RequantShiftChecker, ReshapeChecker, \ + RQIntegerDivChecker, SliceChecker, SoftmaxChecker, TransposeChecker BasicTransformer = CodeTransformation([ArgumentStructGeneration(), MemoryManagementGeneration(), FutureGeneration()]) @@ -142,12 +145,7 @@ BasicITAPartialSoftmaxBinding = NodeBinding(SoftmaxChecker([PointerClass(int8_t)], [PointerClass(int8_t)]), ITAPartialMaxTemplate.referenceTemplate, BasicTransformer) -BasicLayerNormBindings = [ - NodeBinding( - LayerNormChecker([PointerClass(int8_t), PointerClass(int32_t), - PointerClass(int32_t)], [PointerClass(int8_t)]), iLayernormTemplate.referenceTemplate, - BasicTransformer) -] + [ +BasicLayerNormBindings = [ NodeBinding( LayerNormChecker( [PointerClass(float32_t), PointerClass(float32_t), @@ -327,3 +325,4 @@ ConvTransposeTemplate.referenceTemplate, BasicTransformer) for type in FloatDataTypes ] + diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py index cc733937cc..7ea8e4774a 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -358,7 +358,8 @@ def __init__(self, maps: List[NodeMapper]): def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation, channels_first) -> Tuple[Shape, Shape]: if 
len(inputShapes) == 3: - inputShapes[2] = inputShapes[1][0] + # Bias shape must be a list, not a scalar integer, to avoid corrupting tensor shape in export + inputShapes[2] = [inputShapes[1][0]] return (inputShapes, outputShapes) def computeOps(self): @@ -414,6 +415,35 @@ def computeOps(self): total_ops = data_out_size * comparisons_per_window return total_ops +class GlobalAveragePoolLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class GlobalAveragePoolGradLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class AveragePoolLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class AveragePoolGradLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class MaxPoolGradLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + class ReduceMeanLayer(ONNXLayer): @@ -447,6 +477,15 @@ def computeOps(self): return self.mapper.parser.operatorRepresentation['size'] +class ReluGradLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + return self.mapper.parser.operatorRepresentation['size'] + + class LayerNormLayer(ONNXLayer): def __init__(self, maps: List[NodeMapper]): @@ -474,6 +513,18 @@ def __init__(self, maps: List[NodeMapper]): super().__init__(maps) +class MSELossLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class MSELossGradLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + class SoftmaxCrossEntropyLossLayer(ONNXLayer): def __init__(self, maps: List[NodeMapper]): @@ -491,6 +542,11 @@ class SGDLayer(ONNXLayer): def __init__(self, maps: List[NodeMapper]): super().__init__(maps) + def computeOps(self): + + size = self.mapper.parser.operatorRepresentation['size'] + return size * 2 + class 
LinearAttentionLayer(ONNXLayer): @@ -663,6 +719,194 @@ def computeOps(self): return B * C * W * 5 +class BatchNormInternalLayer(ONNXLayer): + """Layer for ORT BatchNormInternal (training-mode BN forward pass).""" + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + opRep = self.mapper.parser.operatorRepresentation + N = opRep['N'] + C = opRep['C'] + H_in = opRep['H_in'] + W_in = opRep['W_in'] + # 2 passes over N*C*H*W plus per-channel reductions + return N * C * H_in * W_in * 7 + + +class BatchNormalizationGradLayer(ONNXLayer): + """Layer for ORT BatchNormalizationGrad (BN backward pass).""" + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + opRep = self.mapper.parser.operatorRepresentation + N = opRep['N'] + C = opRep['C'] + H_in = opRep['H_in'] + W_in = opRep['W_in'] + # 2 passes for reductions + 1 pass for dX + return N * C * H_in * W_in * 10 + + +class WelfordReduceLayer(ONNXLayer): + """Layer for WelfordReduce (split BN forward reduction).""" + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + opRep = self.mapper.parser.operatorRepresentation + N = opRep['N'] + C = opRep['C'] + H_in = opRep['H_in'] + W_in = opRep['W_in'] + return N * C * H_in * W_in * 3 + + +class ChannelNormalizeLayer(ONNXLayer): + """Layer for ChannelNormalize (split BN forward elementwise).""" + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + opRep = self.mapper.parser.operatorRepresentation + N = opRep['N'] + C = opRep['C'] + H_in = opRep['H_in'] + W_in = opRep['W_in'] + return N * C * H_in * W_in * 4 + + +class BNGradReduceLayer(ONNXLayer): + """Layer for BNGradReduce (split BN backward reduction).""" + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + opRep = self.mapper.parser.operatorRepresentation + N = opRep['N'] + C = opRep['C'] + H_in = 
opRep['H_in'] + W_in = opRep['W_in'] + return N * C * H_in * W_in * 5 + + +class BNGradNormalizeLayer(ONNXLayer): + """Layer for BNGradNormalize (split BN backward elementwise).""" + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + opRep = self.mapper.parser.operatorRepresentation + N = opRep['N'] + C = opRep['C'] + H_in = opRep['H_in'] + W_in = opRep['W_in'] + return N * C * H_in * W_in * 5 + + +class ConvGradXLayer(ONNXLayer): + """Layer for computing input gradients in convolution backward pass""" + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + """ + ConvGradX computes gradients w.r.t. input. + Similar computation to ConvTranspose: for each input pixel, + we need to compute contributions from all output pixels in the receptive field. + + Operations: kernel_h * kernel_w * ch_in * ch_out * dim_im_in_x * dim_im_in_y * 2 + """ + opRep = self.mapper.parser.operatorRepresentation + + if "group" in opRep: + groups = opRep['group'] + else: + groups = 1 + + kernel_shape = int(np.prod(opRep['kernel_shape'])) + ch_in = opRep['ch_im_in'] + ch_out = opRep['ch_im_out'] + + # Operations per input pixel + opsPerPx = int(kernel_shape * ch_in * ch_out / groups) * 2 + + # Number of input pixels (output of ConvGradX) + if 'dim_im_in_y' in opRep: + numPx = opRep['dim_im_in_x'] * opRep['dim_im_in_y'] + else: + numPx = opRep['dim_im_in_x'] + + return numPx * opsPerPx + + +class ConvGradWLayer(ONNXLayer): + """Layer for computing weight gradients in convolution backward pass""" + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + """ + ConvGradW computes gradients w.r.t. weights. + For each weight element, we accumulate contributions from all output positions. 
+ + Weight size: kernel_h * kernel_w * ch_in * ch_out (or ch_out for DW conv) + For each weight: dim_im_out_x * dim_im_out_y * 2 operations + + Total: kernel_h * kernel_w * ch_in * ch_out * dim_im_out_x * dim_im_out_y * 2 + """ + opRep = self.mapper.parser.operatorRepresentation + + if "group" in opRep: + groups = opRep['group'] + else: + groups = 1 + + kernel_shape = int(np.prod(opRep['kernel_shape'])) + ch_in = opRep['ch_im_in'] + ch_out = opRep['ch_im_out'] + + # Number of output spatial positions + if 'dim_im_out_y' in opRep: + num_output_positions = opRep['dim_im_out_x'] * opRep['dim_im_out_y'] + else: + num_output_positions = opRep['dim_im_out_x'] + + # For depthwise convolution (groups == ch_in == ch_out) + if groups == ch_in and groups == ch_out: + # DW: kernel_h * kernel_w * ch_out weights + num_weights = kernel_shape * ch_out + else: + # Regular or grouped conv: kernel_h * kernel_w * (ch_in/groups) * ch_out weights + num_weights = int(kernel_shape * ch_in * ch_out / groups) + + # Each weight needs to be computed from all output positions (MAC operation) + total_ops = num_weights * num_output_positions * 2 + + return total_ops + + +class ConvGradBLayer(ONNXLayer): + """Layer for computing bias gradients in convolution backward pass""" + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + """ConvGradB: dB[c] = sum_{n,h,w} dY[n,c,h,w].""" + opRep = self.mapper.parser.operatorRepresentation + return opRep['batch'] * opRep['ch_im_out'] * opRep['dim_im_out_x'] * opRep['dim_im_out_y'] + + class ConvTransposeLayer(ONNXLayer): def __init__(self, maps: List[NodeMapper]): @@ -709,3 +953,19 @@ def computeOps(self): numPx = opRep['dim_im_out_x'] return numPx * opsPerPx + + +class InPlaceAccumulatorV2Layer(ONNXLayer): + """Layer for ORT InPlaceAccumulatorV2 operator (com.microsoft). 
+ + Gradient accumulation with optional reset: + if lazy_reset_grad: out = gradient + else: out = buffer + gradient + """ + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + # One conditional check + one element-wise op (copy or add) per element + return self.mapper.parser.operatorRepresentation['size'] diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py index ad787d9e4b..6c73347643 100644 --- a/Deeploy/Targets/Generic/Parsers.py +++ b/Deeploy/Targets/Generic/Parsers.py @@ -329,6 +329,135 @@ def parseNodeCtxt(self, return newCtxt, wellFormed +class AveragePool2DParser(MaxPool2DParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + ret = super().parseNode(node) + wellFormed = False + if ret: + pads = self.operatorRepresentation['pads'] + kernel_shape = self.operatorRepresentation['kernel_shape'] + strides = self.operatorRepresentation['strides'] + if len(pads) == 4 and len(kernel_shape) == 2 and len(strides) == 2: + wellFormed = True + self.operatorRepresentation['padding_x_left'] = int(pads[0]) + self.operatorRepresentation['padding_y_top'] = int(pads[1]) + self.operatorRepresentation['padding_x_right'] = int(pads[2]) + self.operatorRepresentation['padding_y_bottom'] = int(pads[3]) + self.operatorRepresentation['stride_x'] = int(strides[0]) + self.operatorRepresentation['stride_y'] = int(strides[1]) + self.operatorRepresentation['dim_kernel_x'] = int(kernel_shape[0]) + self.operatorRepresentation['dim_kernel_y'] = int(kernel_shape[1]) + return wellFormed + + def parseNodeCtxt(self, ctxt, node, channels_first=True): + newCtxt, ret = super(MaxPool2DParser, self).parseNodeCtxt(ctxt, node, channels_first) + wellFormed = False + if ret: + data_in = newCtxt.lookup(self.operatorRepresentation['data_in']) + data_out = newCtxt.lookup(self.operatorRepresentation['data_out']) + self.operatorRepresentation['batch'] = data_in.shape[0] + if 
channels_first: + self.operatorRepresentation['ch_im_in'] = data_in.shape[1] + self.operatorRepresentation['dim_im_in_x'] = data_in.shape[2] + self.operatorRepresentation['dim_im_in_y'] = data_in.shape[3] + self.operatorRepresentation['ch_im_out'] = data_out.shape[1] + self.operatorRepresentation['dim_im_out_x'] = data_out.shape[2] + self.operatorRepresentation['dim_im_out_y'] = data_out.shape[3] + else: + self.operatorRepresentation['ch_im_in'] = data_in.shape[3] + self.operatorRepresentation['dim_im_in_x'] = data_in.shape[1] + self.operatorRepresentation['dim_im_in_y'] = data_in.shape[2] + self.operatorRepresentation['ch_im_out'] = data_out.shape[3] + self.operatorRepresentation['dim_im_out_x'] = data_out.shape[1] + self.operatorRepresentation['dim_im_out_y'] = data_out.shape[2] + if len(data_in.shape) == 4 and len(data_out.shape) == 4: + wellFormed = True + return newCtxt, wellFormed + + +class MaxPoolGradParser(NodeParser): + """Parser for MaxPoolGrad custom training operator. + + Inputs: + 0: grad_output (dY) - upstream gradient, shape [N, C, Ho, Wo] or [N, Ho, Wo, C] + 1: original_input (X) - forward input, shape [N, C, Hi, Wi] or [N, Hi, Wi, C] + Output: + 0: grad_input (dX) - gradient w.r.t. 
forward input, same shape as original_input + Attributes: kernel_shape, strides, pads (same as MaxPool) + """ + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + ret = all([ + 'kernel_shape' in node.attrs, + 'pads' in node.attrs, + 'strides' in node.attrs, + len(node.inputs) == 2, + len(node.outputs) == 1, + ]) + + if ret: + pads = node.attrs['pads'] + kernel_shape = node.attrs['kernel_shape'] + strides = node.attrs['strides'] + + if not (len(pads) == 4 and len(kernel_shape) == 2 and len(strides) == 2): + return False + + self.operatorRepresentation['pads'] = pads + self.operatorRepresentation['kernel_shape'] = kernel_shape + self.operatorRepresentation['strides'] = strides + self.operatorRepresentation['padding_x_left'] = int(pads[0]) + self.operatorRepresentation['padding_y_top'] = int(pads[1]) + self.operatorRepresentation['padding_x_right'] = int(pads[2]) + self.operatorRepresentation['padding_y_bottom'] = int(pads[3]) + self.operatorRepresentation['stride_x'] = int(strides[0]) + self.operatorRepresentation['stride_y'] = int(strides[1]) + self.operatorRepresentation['dim_kernel_x'] = int(kernel_shape[0]) + self.operatorRepresentation['dim_kernel_y'] = int(kernel_shape[1]) + + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + data_in = ctxt.lookup(node.inputs[0].name) + data_out = ctxt.lookup(node.outputs[0].name) + x_in = ctxt.lookup(node.inputs[1].name) + + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['x_in'] = x_in.name + self.operatorRepresentation['data_out'] = data_out.name + + if channels_first: + self.operatorRepresentation['batch'] = data_in.shape[0] + self.operatorRepresentation['ch_im_in'] = data_in.shape[1] + self.operatorRepresentation['dim_im_in_x'] = data_in.shape[2] + self.operatorRepresentation['dim_im_in_y'] = data_in.shape[3] + self.operatorRepresentation['ch_im_out'] 
= data_out.shape[1] + self.operatorRepresentation['dim_im_out_x'] = data_out.shape[2] + self.operatorRepresentation['dim_im_out_y'] = data_out.shape[3] + else: + self.operatorRepresentation['batch'] = data_in.shape[0] + self.operatorRepresentation['ch_im_in'] = data_in.shape[3] + self.operatorRepresentation['dim_im_in_x'] = data_in.shape[1] + self.operatorRepresentation['dim_im_in_y'] = data_in.shape[2] + self.operatorRepresentation['ch_im_out'] = data_out.shape[3] + self.operatorRepresentation['dim_im_out_x'] = data_out.shape[1] + self.operatorRepresentation['dim_im_out_y'] = data_out.shape[2] + + wellFormed = (len(data_in.shape) == 4 and len(x_in.shape) == 4 and len(data_out.shape) == 4) + return ctxt, wellFormed + + class PadParser(NodeParser): def __init__(self): @@ -1117,6 +1246,33 @@ def parseNodeCtxt(self, return ctxt, True +class ReluGradParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + + ret = all([len(node.inputs) == 2, len(node.outputs) == 1]) + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + upstream_grad = ctxt.lookup(node.inputs[0].name) + relu_input = ctxt.lookup(node.inputs[1].name) + relu_grad = ctxt.lookup(node.outputs[0].name) + + self.operatorRepresentation['grad_out'] = upstream_grad.name + self.operatorRepresentation['data_in'] = relu_input.name + self.operatorRepresentation['grad_in'] = relu_grad.name + self.operatorRepresentation['size'] = np.prod(upstream_grad.shape) + + return ctxt, True + + class ReshapeParser(NodeParser): def parseNode(self, node: gs.Node) -> (bool): @@ -1694,12 +1850,15 @@ def parseNodeCtxt(self, channels_first: bool = True) -> Tuple[NetworkContext, bool]: inputs = ['data_in', 'weight', 'bias'] - outputs = ['data_out'] + # ONNX LayerNormalization can have up to 3 outputs: Y, mean, inv_std_dev. 
+ # The extra outputs are needed by LayerNormalizationGrad in training graphs. + outputs = ['data_out', 'mean', 'inv_std_dev'] for idx, inputNode in enumerate(node.inputs): self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name for idx, outputNode in enumerate(node.outputs): - self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name + if idx < len(outputs): + self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name self.operatorRepresentation['size'] = np.prod(ctxt.lookup(node.inputs[0].name).shape) self.operatorRepresentation['lastDimLength'] = ctxt.lookup(node.inputs[0].name).shape[-1] @@ -1711,7 +1870,9 @@ class LayerNormGradParser(iLayerNormParser): def parseNode(self, node: gs.Node) -> (bool): - ret = all(['epsilon' in node.attrs, len(node.inputs) == 4, len(node.outputs) == 1]) + # ONNX LayerNormalizationGrad has 5 inputs [dY, X, scale, mean, inv_std_dev] + # and 3 outputs [dX, dscale, dbias]. + ret = all(['epsilon' in node.attrs, len(node.inputs) == 5, len(node.outputs) == 3]) if ret: self.operatorRepresentation['epsilon'] = node.attrs['epsilon'] @@ -1723,8 +1884,12 @@ def parseNodeCtxt(self, node: gs.Node, channels_first: bool = True) -> Tuple[NetworkContext, bool]: - inputs = ['grad_in', 'data_in', 'weight', 'bias'] - outputs = ['grad_out'] + # inputs: [dY, X, scale, mean, inv_std_dev] + # mean and inv_std_dev are not passed to the kernel (recomputed internally), + # but are mapped so Deeploy can track them. 
class MSELossParser(NodeParser):
    """Parser for MSELoss.

    Inputs  (2): pred, target (assumed same shape — TODO confirm upstream)
    Outputs (1): loss
    """

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        return all([len(node.inputs) == 2, len(node.outputs) == 1])

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        pred = ctxt.lookup(node.inputs[0].name)
        target = ctxt.lookup(node.inputs[1].name)
        loss = ctxt.lookup(node.outputs[0].name)

        self.operatorRepresentation['pred'] = pred.name
        self.operatorRepresentation['target'] = target.name
        self.operatorRepresentation['loss'] = loss.name
        # FIX: replace the manual multiply loop with np.prod, cast to int so
        # the representation carries a plain Python int (file convention).
        self.operatorRepresentation['num_elements'] = int(np.prod(pred.shape))

        return ctxt, True


class MSELossGradParser(NodeParser):
    """Parser for MSELossGrad (backward pass of MSELoss).

    Inputs  (2): pred, target (assumed same shape — TODO confirm upstream)
    Outputs (1): gradient w.r.t. pred
    """

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        return all([len(node.inputs) == 2, len(node.outputs) == 1])

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        pred = ctxt.lookup(node.inputs[0].name)
        target = ctxt.lookup(node.inputs[1].name)
        grad = ctxt.lookup(node.outputs[0].name)

        self.operatorRepresentation['pred'] = pred.name
        self.operatorRepresentation['target'] = target.name
        self.operatorRepresentation['grad'] = grad.name
        # FIX: same np.prod simplification as in MSELossParser.
        self.operatorRepresentation['num_elements'] = int(np.prod(pred.shape))

        return ctxt, True
+ """ + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + if len(node.inputs) != 5: + return False + if len(node.outputs) != 5: + return False + self.operatorRepresentation['epsilon'] = float(node.attrs.get('epsilon', 1e-5)) + self.operatorRepresentation['momentum'] = float(node.attrs.get('momentum', 0.9)) + self.operatorRepresentation['training_mode'] = int(node.attrs.get('training_mode', 1)) + return True + + def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + # Inputs + self.operatorRepresentation['data_in'] = ctxt.lookup(node.inputs[0].name).name + self.operatorRepresentation['scale'] = ctxt.lookup(node.inputs[1].name).name + self.operatorRepresentation['bias'] = ctxt.lookup(node.inputs[2].name).name + self.operatorRepresentation['running_mean'] = ctxt.lookup(node.inputs[3].name).name + self.operatorRepresentation['running_var'] = ctxt.lookup(node.inputs[4].name).name + + # Outputs + self.operatorRepresentation['data_out'] = ctxt.lookup(node.outputs[0].name).name + self.operatorRepresentation['updated_running_mean'] = ctxt.lookup(node.outputs[1].name).name + self.operatorRepresentation['updated_running_var'] = ctxt.lookup(node.outputs[2].name).name + self.operatorRepresentation['saved_mean'] = ctxt.lookup(node.outputs[3].name).name + self.operatorRepresentation['saved_inv_std'] = ctxt.lookup(node.outputs[4].name).name + + # Spatial shape from input[0] + input_shape = ctxt.lookup(node.inputs[0].name).shape + N = int(input_shape[0]) + C = int(input_shape[1]) + H_in = int(input_shape[2]) + W_in = int(input_shape[3]) + self.operatorRepresentation['N'] = N + self.operatorRepresentation['C'] = C + self.operatorRepresentation['H_in'] = H_in + self.operatorRepresentation['W_in'] = W_in + + # Fix unknown shapes for saved_mean and saved_inv_std (ONNX reports '?') + for out_idx in [3, 4]: + buf = ctxt.lookup(node.outputs[out_idx].name) + if buf.shape is 
None or (hasattr(buf.shape, '__len__') and len(buf.shape) == 0): + buf.shape = (C,) + + return ctxt, True + + +class BatchNormalizationGradParser(NodeParser): + """Parser for ORT BatchNormalizationGrad (backward pass). + + Inputs (5): dY, X, scale (gamma), saved_mean, saved_inv_std + Outputs (3): dX, dgamma, dbeta + + dgamma and dbeta have shape '?' in ONNX — inferred as [C] from input[2] (gamma). + """ + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + if len(node.inputs) != 5: + return False + if len(node.outputs) != 3: + return False + self.operatorRepresentation['epsilon'] = float(node.attrs.get('epsilon', 1e-5)) + return True + + def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + # Inputs + self.operatorRepresentation['dY'] = ctxt.lookup(node.inputs[0].name).name + self.operatorRepresentation['X'] = ctxt.lookup(node.inputs[1].name).name + self.operatorRepresentation['gamma'] = ctxt.lookup(node.inputs[2].name).name + self.operatorRepresentation['saved_mean'] = ctxt.lookup(node.inputs[3].name).name + self.operatorRepresentation['saved_inv_std'] = ctxt.lookup(node.inputs[4].name).name + + # Outputs + self.operatorRepresentation['dX'] = ctxt.lookup(node.outputs[0].name).name + self.operatorRepresentation['dgamma'] = ctxt.lookup(node.outputs[1].name).name + self.operatorRepresentation['dbeta'] = ctxt.lookup(node.outputs[2].name).name + + # Shape from dY (input[0]) + dy_shape = ctxt.lookup(node.inputs[0].name).shape + N = int(dy_shape[0]) + C = int(dy_shape[1]) + H_in = int(dy_shape[2]) + W_in = int(dy_shape[3]) + self.operatorRepresentation['N'] = N + self.operatorRepresentation['C'] = C + self.operatorRepresentation['H_in'] = H_in + self.operatorRepresentation['W_in'] = W_in + + # Fix unknown shapes for dgamma and dbeta (ONNX reports '?') + for out_idx in [1, 2]: + buf = ctxt.lookup(node.outputs[out_idx].name) + if buf.shape is None or 
(hasattr(buf.shape, '__len__') and len(buf.shape) == 0): + buf.shape = (C,) + + return ctxt, True + + +class WelfordReduceParser(NodeParser): + """Parser for WelfordReduce (split BN forward reduction). + + Inputs (1): X [N,C,H,W] + Outputs (2): saved_mean [C], saved_inv_std [C] + """ + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + if len(node.inputs) != 1 or len(node.outputs) != 2: + return False + self.operatorRepresentation['epsilon'] = float(node.attrs.get('epsilon', 1e-5)) + return True + + def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + self.operatorRepresentation['data_in'] = ctxt.lookup(node.inputs[0].name).name + self.operatorRepresentation['saved_mean'] = ctxt.lookup(node.outputs[0].name).name + self.operatorRepresentation['saved_inv_std'] = ctxt.lookup(node.outputs[1].name).name + + in_shape = ctxt.lookup(node.inputs[0].name).shape + N, C, H_in, W_in = int(in_shape[0]), int(in_shape[1]), int(in_shape[2]), int(in_shape[3]) + self.operatorRepresentation['N'] = N + self.operatorRepresentation['C'] = C + self.operatorRepresentation['H_in'] = H_in + self.operatorRepresentation['W_in'] = W_in + + # Fix unknown shapes for outputs + for out_idx in [0, 1]: + buf = ctxt.lookup(node.outputs[out_idx].name) + if buf.shape is None or (hasattr(buf.shape, '__len__') and len(buf.shape) == 0): + buf.shape = (C,) + + return ctxt, True + + +class ChannelNormalizeParser(NodeParser): + """Parser for ChannelNormalize (split BN forward elementwise). 
class ChannelNormalizeParser(NodeParser):
    """Parser for ChannelNormalize (split BN forward elementwise).

    Inputs (5): X [N,C,H,W], saved_mean [C], saved_inv_std [C], gamma [C], beta [C]
    Outputs (1): Y [N,C,H,W]
    """

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        # Purely structural: five inputs, one output.
        return len(node.inputs) == 5 and len(node.outputs) == 1

    def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
        rep = self.operatorRepresentation

        for key, tensor in zip(['data_in', 'saved_mean', 'saved_inv_std', 'gamma', 'beta'], node.inputs):
            rep[key] = ctxt.lookup(tensor.name).name
        rep['data_out'] = ctxt.lookup(node.outputs[0].name).name

        # NCHW geometry from the activation input.
        in_shape = ctxt.lookup(node.inputs[0].name).shape
        rep['N'] = int(in_shape[0])
        rep['C'] = int(in_shape[1])
        rep['H_in'] = int(in_shape[2])
        rep['W_in'] = int(in_shape[3])

        return ctxt, True
class BNGradReduceParser(NodeParser):
    """Parser for BNGradReduce (split BN backward reduction).

    Inputs (4): dY [N,C,H,W], X [N,C,H,W], saved_mean [C], saved_inv_std [C]
    Outputs (2): dgamma [C], dbeta [C]
    """

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        if len(node.inputs) != 4 or len(node.outputs) != 2:
            return False
        self.operatorRepresentation['epsilon'] = float(node.attrs.get('epsilon', 1e-5))
        return True

    def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
        rep = self.operatorRepresentation

        for key, tensor in zip(['dY', 'X', 'saved_mean', 'saved_inv_std'], node.inputs):
            rep[key] = ctxt.lookup(tensor.name).name
        rep['dgamma'] = ctxt.lookup(node.outputs[0].name).name
        rep['dbeta'] = ctxt.lookup(node.outputs[1].name).name

        # Geometry from dY.
        dy_shape = ctxt.lookup(node.inputs[0].name).shape
        C = int(dy_shape[1])
        rep['N'] = int(dy_shape[0])
        rep['C'] = C
        rep['H_in'] = int(dy_shape[2])
        rep['W_in'] = int(dy_shape[3])

        # dgamma / dbeta have unknown ('?') shapes — fix to [C].
        for out in node.outputs:
            buf = ctxt.lookup(out.name)
            if buf.shape is None or (hasattr(buf.shape, '__len__') and len(buf.shape) == 0):
                buf.shape = (C,)

        return ctxt, True
class BNGradNormalizeParser(NodeParser):
    """Parser for BNGradNormalize (split BN backward elementwise).

    Inputs (7): dY, X, saved_mean, saved_inv_std, gamma, dgamma, dbeta
    Outputs (1): dX [N,C,H,W]
    """

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        if len(node.inputs) != 7 or len(node.outputs) != 1:
            return False
        self.operatorRepresentation['epsilon'] = float(node.attrs.get('epsilon', 1e-5))
        return True

    def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
        rep = self.operatorRepresentation

        inKeys = ['dY', 'X', 'saved_mean', 'saved_inv_std', 'gamma', 'dgamma', 'dbeta']
        for key, tensor in zip(inKeys, node.inputs):
            rep[key] = ctxt.lookup(tensor.name).name
        rep['dX'] = ctxt.lookup(node.outputs[0].name).name

        # Geometry from dY.
        dy_shape = ctxt.lookup(node.inputs[0].name).shape
        N = int(dy_shape[0])
        H_in = int(dy_shape[2])
        W_in = int(dy_shape[3])
        rep['N'] = N
        rep['C'] = int(dy_shape[1])
        rep['H_in'] = H_in
        rep['W_in'] = W_in

        # BN gradient formula needs 1/(N*H*W) over the FULL spatial extent.
        rep['N_total_inv'] = 1.0 / float(N * H_in * W_in)

        return ctxt, True
class GlobalAveragePoolParser(NodeParser):
    """Parser for GlobalAveragePool (NCHW).

    Input: data_in [N, C, H, W]
    Output: data_out [N, C, 1, 1]
    """

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        # Purely structural: single input, single output.
        return len(node.inputs) == 1 and len(node.outputs) == 1

    def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
        rep = self.operatorRepresentation
        rep['data_in'] = ctxt.lookup(node.inputs[0].name).name
        rep['data_out'] = ctxt.lookup(node.outputs[0].name).name

        in_shape = ctxt.lookup(node.inputs[0].name).shape
        # Only 4D (NCHW) inputs are supported.
        if len(in_shape) != 4:
            return ctxt, False

        H = int(in_shape[2])
        W = int(in_shape[3])
        rep['batch'] = int(in_shape[0])
        rep['channels'] = int(in_shape[1])
        rep['dim_im_in_x'] = H
        rep['dim_im_in_y'] = W
        # Number of elements averaged per (n, c) pair.
        rep['size'] = H * W
        return ctxt, True
class GlobalAveragePoolGradParser(NodeParser):
    """Parser for fused GlobalAveragePoolGrad.

    Input: dY [N, C, 1, 1] (stored as N*C elements)
    Output: dX [N, C, H, W]
    Attrs: kernel_shape=[H, W]
    """

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        # One input, one output, and a 2D kernel_shape attribute are required.
        if len(node.inputs) != 1 or len(node.outputs) != 1:
            return False
        if 'kernel_shape' not in node.attrs:
            return False
        kernel_shape = node.attrs['kernel_shape']
        if len(kernel_shape) != 2:
            return False
        # Spatial extent the single dY value is redistributed over.
        self.operatorRepresentation['H'] = int(kernel_shape[0])
        self.operatorRepresentation['W'] = int(kernel_shape[1])
        return True

    def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
        rep = self.operatorRepresentation
        rep['dY'] = ctxt.lookup(node.inputs[0].name).name
        rep['dX'] = ctxt.lookup(node.outputs[0].name).name

        dy_shape = ctxt.lookup(node.inputs[0].name).shape
        # At minimum the batch and channel dims must be present on dY.
        if len(dy_shape) < 2:
            return ctxt, False

        rep['batch'] = int(dy_shape[0])
        rep['channels'] = int(dy_shape[1])
        return ctxt, True
= strides self.operatorRepresentation['pads'] = pads self.operatorRepresentation['kernel_shape'] = kernel_shape @@ -2760,6 +3342,26 @@ def parseNode(self, node: gs.Node) -> bool: self.operatorRepresentation['group'] = group self.operatorRepresentation['nodeName'] = node.name self.operatorRepresentation['nodeOp'] = node.op + + # Set kernel dimensions + # Note: Following system convention where _x refers to H, _y refers to W + self.operatorRepresentation['dim_kernel_x'] = int(kernel_shape[0]) # kH + self.operatorRepresentation['dim_kernel_y'] = int(kernel_shape[1]) # kW + + # Set strides + self.operatorRepresentation['stride_x'] = int(strides[0]) # stride_H + self.operatorRepresentation['stride_y'] = int(strides[1]) # stride_W + + # Set dilations + self.operatorRepresentation['dilation_x'] = int(dilations[0]) # dilation_H + self.operatorRepresentation['dilation_y'] = int(dilations[1]) # dilation_W + + # Set padding (top, left, bottom, right) + self.operatorRepresentation['padding_y_top'] = int(pads[0]) + self.operatorRepresentation['padding_x_left'] = int(pads[1]) + self.operatorRepresentation['padding_y_bottom'] = int(pads[2]) + self.operatorRepresentation['padding_x_right'] = int(pads[3]) + return wellFormed def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node, channels_first: bool = True): @@ -2767,50 +3369,62 @@ def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node, channels_first: boo self.operatorRepresentation['data_in'] = node.inputs[0].name self.operatorRepresentation['weight'] = node.inputs[1].name self.operatorRepresentation['data_out'] = node.outputs[0].name + + # Handle optional bias if len(node.inputs) == 3: self.operatorRepresentation['bias'] = node.inputs[2].name self.operatorRepresentation['has_bias'] = "true" else: self.operatorRepresentation['has_bias'] = "false" - # Get output shape from context - data_out = ctxt.lookup(node.outputs[0].name) - out_shape = data_out.shape - if len(out_shape) == 3: - 
self.operatorRepresentation['dim_im_out_x'] = out_shape[2] - elif len(out_shape) == 4: - self.operatorRepresentation['dim_im_out_x'] = out_shape[2] - self.operatorRepresentation['dim_im_out_y'] = out_shape[3] - - stride_x, stride_y = 1, 1 - if "strides" in node.attrs: - stride_y = node.attrs["strides"][0] - stride_x = node.attrs["strides"][1] if len(node.attrs["strides"]) > 1 else stride_y - self.operatorRepresentation["stride_y"] = stride_y - self.operatorRepresentation["stride_x"] = stride_x - - if "kernel_shape" in node.attrs: - kernel_shape = node.attrs["kernel_shape"] - kernel_shape_x = kernel_shape[0] - # For 2D, kernel_shape may have two elements - kernel_shape_y = kernel_shape[1] if len(kernel_shape) > 1 else kernel_shape_x - else: - kernel_shape_x = 1 - kernel_shape_y = 1 + # Get tensors from context data_in = ctxt.lookup(node.inputs[0].name) data_out = ctxt.lookup(node.outputs[0].name) + weight = ctxt.lookup(node.inputs[1].name) + + # Get shapes in_shape = data_in.shape out_shape = data_out.shape - self.operatorRepresentation['ch_im_in'] = in_shape[1] - self.operatorRepresentation['dim_im_in_y'] = in_shape[2] - self.operatorRepresentation['ch_im_out'] = out_shape[1] - self.operatorRepresentation['dim_im_out_y'] = out_shape[2] + # Validate 4D tensors (NCHW) + if len(in_shape) != 4 or len(out_shape) != 4: + return ctxt, False + + # Set batch size + self.operatorRepresentation['batch'] = in_shape[0] + + if channels_first: + # NCHW format + # Note: Following system convention where _x refers to H (shape[2]), _y refers to W (shape[3]) + self.operatorRepresentation['ch_im_in'] = in_shape[1] + self.operatorRepresentation['dim_im_in_x'] = in_shape[2] # H + self.operatorRepresentation['dim_im_in_y'] = in_shape[3] # W + + self.operatorRepresentation['ch_im_out'] = out_shape[1] + self.operatorRepresentation['dim_im_out_x'] = out_shape[2] # H + self.operatorRepresentation['dim_im_out_y'] = out_shape[3] # W + else: + # NHWC format + 
self.operatorRepresentation['ch_im_in'] = in_shape[3] + self.operatorRepresentation['dim_im_in_x'] = in_shape[1] # H + self.operatorRepresentation['dim_im_in_y'] = in_shape[2] # W + + self.operatorRepresentation['ch_im_out'] = out_shape[3] + self.operatorRepresentation['dim_im_out_x'] = out_shape[1] # H + self.operatorRepresentation['dim_im_out_y'] = out_shape[2] # W + + # Calculate batch offsets (elements per batch) + self.operatorRepresentation['batchOffsetIn'] = ( + self.operatorRepresentation['ch_im_in'] * + self.operatorRepresentation['dim_im_in_x'] * + self.operatorRepresentation['dim_im_in_y'] + ) + self.operatorRepresentation['batchOffsetOut'] = ( + self.operatorRepresentation['ch_im_out'] * + self.operatorRepresentation['dim_im_out_x'] * + self.operatorRepresentation['dim_im_out_y'] + ) - self.operatorRepresentation[ - 'batchOffsetIn'] = self.operatorRepresentation['ch_im_in'] * self.operatorRepresentation['dim_im_in_y'] - self.operatorRepresentation[ - 'batchOffsetOut'] = self.operatorRepresentation['ch_im_out'] * self.operatorRepresentation['dim_im_out_y'] return ctxt, True @@ -2846,8 +3460,27 @@ def parseNodeCtxt(self, channels_first: bool = True) -> Tuple[NetworkContext, bool]: newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) - if ret: + data_in = newCtxt.lookup(self.operatorRepresentation['data_in']) + data_out = newCtxt.lookup(self.operatorRepresentation['data_out']) + weight = newCtxt.lookup(self.operatorRepresentation['weight']) + + self.operatorRepresentation['batch'] = data_in.shape[0] + if channels_first: + self.operatorRepresentation['ch_im_in'] = data_in.shape[1] + self.operatorRepresentation['dim_im_in_x'] = data_in.shape[2] + self.operatorRepresentation['dim_im_in_y'] = data_in.shape[3] + self.operatorRepresentation['ch_im_out'] = data_out.shape[1] + self.operatorRepresentation['dim_im_out_x'] = data_out.shape[2] + self.operatorRepresentation['dim_im_out_y'] = data_out.shape[3] + else: + 
class InPlaceAccumulatorV2Parser(NodeParser):
    """Parser for ORT InPlaceAccumulatorV2 operator (com.microsoft).

    Semantics:
        if lazy_reset_grad: out = gradient (reset)
        else:               out = buffer + gradient (accumulate)

    Inputs:
        0: buffer          - current accumulation buffer (float tensor)
        1: gradient        - new gradient to accumulate (float tensor, same shape)
        2: lazy_reset_grad - reset flag; if true, overwrite; else add (bool[1])

    Output:
        0: output_buffer   - updated accumulation buffer (float tensor)
    """

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        # Require exactly 3 inputs (buffer, gradient, lazy_reset_grad) and 1 output.
        return len(node.inputs) == 3 and len(node.outputs) == 1

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        buffer = ctxt.lookup(node.inputs[0].name)
        gradient = ctxt.lookup(node.inputs[1].name)
        lazy_reset_grad = ctxt.lookup(node.inputs[2].name)
        data_out = ctxt.lookup(node.outputs[0].name)

        self.operatorRepresentation['accum_buffer'] = buffer.name
        self.operatorRepresentation['gradient'] = gradient.name
        self.operatorRepresentation['lazy_reset_grad'] = lazy_reset_grad.name
        self.operatorRepresentation['data_out'] = data_out.name
        self.operatorRepresentation['size'] = int(np.prod(buffer.shape))

        return ctxt, True


class Conv2DGradXParser(Conv2DParser):
    """Parser for the input-gradient half of Conv backward.

    inputs: [dY, W], outputs: [dX]; geometry is taken from dX and dY.
    """

    def __init__(self, noBiasHoisting = True):
        super().__init__(noBiasHoisting)

    def parseNode(self, node: gs.Node) -> bool:

        wellFormed = super().parseNode(node)

        # BUGFIX: this arity check was previously computed into a local and
        # then discarded (the method returned only `wellFormed`). Combine it
        # with the base-class result so malformed nodes are rejected.
        ret = all([len(node.inputs) == 2, len(node.outputs) == 1])

        return wellFormed and ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if ret:

            output_grad_name = node.inputs[0].name  # dY
            weight_name = node.inputs[1].name
            input_grad_name = node.outputs[0].name  # dX

            output_grad = ctxt.lookup(output_grad_name)  # dY: [N, C_out, H_out, W_out]
            input_grad = ctxt.lookup(input_grad_name)  # dX: [N, C_in, H_in, W_in]

            self.operatorRepresentation['grad_out'] = output_grad_name  # dY
            self.operatorRepresentation['weight'] = weight_name
            self.operatorRepresentation['grad_in'] = input_grad_name  # dX

            self.operatorRepresentation['batch'] = output_grad.shape[0]

            # From input_grad (dX): [N, C_in, H_in, W_in]
            self.operatorRepresentation['ch_im_in'] = input_grad.shape[1]
            self.operatorRepresentation['dim_im_in_x'] = input_grad.shape[2]  # H_in
            self.operatorRepresentation['dim_im_in_y'] = input_grad.shape[3]  # W_in

            # From output_grad (dY): [N, C_out, H_out, W_out]
            self.operatorRepresentation['ch_im_out'] = output_grad.shape[1]
            self.operatorRepresentation['dim_im_out_x'] = output_grad.shape[2]  # H_out
            self.operatorRepresentation['dim_im_out_y'] = output_grad.shape[3]  # W_out

            # Initialize offset fields (filled in later during tiling).
            self.operatorRepresentation['offset_grad_in_h'] = 0
            self.operatorRepresentation['offset_grad_in_w'] = 0
            self.operatorRepresentation['offset_grad_out_h'] = 0
            self.operatorRepresentation['offset_grad_out_w'] = 0

            return newCtxt, True

        return ctxt, False
class Conv2DGradWParser(Conv2DParser):
    """Parser for the weight-gradient half of Conv backward.

    inputs: [dY, X], outputs: [dW]; kernel geometry is taken from dW.
    """

    def __init__(self, noBiasHoisting = True):
        super().__init__(noBiasHoisting)

    def parseNode(self, node: gs.Node) -> bool:

        wellFormed = super().parseNode(node)

        # BUGFIX: this arity check was previously computed into a local and
        # then discarded (the method returned only `wellFormed`). Combine it
        # with the base-class result so malformed nodes are rejected.
        ret = all([len(node.inputs) == 2, len(node.outputs) == 1])

        return wellFormed and ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:

        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if ret:

            input_name = node.inputs[1].name  # X
            output_grad_name = node.inputs[0].name  # dY
            weight_grad_name = node.outputs[0].name  # dW

            input = ctxt.lookup(input_name)  # X: [N, C_in, H_in, W_in]
            output_grad = ctxt.lookup(output_grad_name)  # dY: [N, C_out, H_out, W_out]
            weight_grad = ctxt.lookup(weight_grad_name)  # dW: [C_out, C_in, kH, kW]

            self.operatorRepresentation['data_in'] = input_name  # X
            self.operatorRepresentation['grad_out'] = output_grad_name  # dY
            self.operatorRepresentation['grad_weight'] = weight_grad_name  # dW

            self.operatorRepresentation['batch'] = output_grad.shape[0]

            self.operatorRepresentation['ch_im_in'] = input.shape[1]
            self.operatorRepresentation['dim_im_in_x'] = input.shape[2]  # H_in
            self.operatorRepresentation['dim_im_in_y'] = input.shape[3]  # W_in

            self.operatorRepresentation['ch_im_out'] = output_grad.shape[1]
            self.operatorRepresentation['dim_im_out_x'] = output_grad.shape[2]  # H_out
            self.operatorRepresentation['dim_im_out_y'] = output_grad.shape[3]  # W_out

            self.operatorRepresentation['dim_kernel_x'] = weight_grad.shape[2]  # kH
            self.operatorRepresentation['dim_kernel_y'] = weight_grad.shape[3]  # kW
            self.operatorRepresentation['kernel_shape'] = [weight_grad.shape[2], weight_grad.shape[3]]  # [kH, kW]

            return newCtxt, True

        return ctxt, False
class Conv2DGradBParser(NodeParser):
    """Parser for ConvGradB: dB[c] = sum_{n,h,w} dY[n,c,h,w].

    inputs: [dY: N,C_out,H,W], outputs: [dB: C_out]
    """

    def __init__(self):
        super().__init__()

    def parseNode(self, node: gs.Node) -> bool:
        return len(node.inputs) == 1 and len(node.outputs) == 1

    def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
        rep = self.operatorRepresentation
        output_grad = ctxt.lookup(node.inputs[0].name)
        bias_grad = ctxt.lookup(node.outputs[0].name)

        rep['grad_out'] = output_grad.name
        rep['grad_bias'] = bias_grad.name
        # Geometry of dY: [N, C_out, H_out, W_out].
        rep['batch'] = output_grad.shape[0]
        rep['ch_im_out'] = output_grad.shape[1]
        rep['dim_im_out_x'] = output_grad.shape[2]
        rep['dim_im_out_y'] = output_grad.shape[3]
        return ctxt, True


class Conv2DGradXWParser(NodeParser):
    """Combined ConvGrad no-bias: 3 inputs (dY, X, W), 2 outputs (dX, dW)."""

    def __init__(self):
        super().__init__()

    def parseNode(self, node):
        wellFormed = all([
            'dilations' in node.attrs,
            'group' in node.attrs,
            'pads' in node.attrs,
            'strides' in node.attrs,
            len(node.inputs) == 3,
            len(node.outputs) == 2,
            len(node.attrs['strides']) == 2,
            len(node.attrs['pads']) == 4,
            len(node.attrs['dilations']) == 2,
        ])
        if wellFormed:
            # Derive kernel_shape from the weight tensor when absent.
            if 'kernel_shape' not in node.attrs:
                node.attrs['kernel_shape'] = node.inputs[2].shape[-2:]
            rep = self.operatorRepresentation
            for attr in ('group', 'pads', 'strides', 'dilations', 'kernel_shape'):
                rep[attr] = node.attrs[attr]
            rep['dim_kernel_x'] = int(node.attrs['kernel_shape'][0])
            rep['dim_kernel_y'] = int(node.attrs['kernel_shape'][1])
            rep['stride_x'] = int(node.attrs['strides'][0])
            rep['stride_y'] = int(node.attrs['strides'][1])
        return wellFormed

    def parseNodeCtxt(self, ctxt, node, channels_first = True):
        rep = self.operatorRepresentation

        grad_out = ctxt.lookup(node.inputs[0].name)
        data_in = ctxt.lookup(node.inputs[1].name)
        weight = ctxt.lookup(node.inputs[2].name)
        grad_in = ctxt.lookup(node.outputs[0].name)
        grad_weight = ctxt.lookup(node.outputs[1].name)

        rep['grad_out'] = grad_out.name
        rep['data_in'] = data_in.name
        rep['weight'] = weight.name
        rep['grad_in'] = grad_in.name
        rep['grad_weight'] = grad_weight.name

        # Geometry: X is [N, C_in, H_in, W_in], dY is [N, C_out, H_out, W_out].
        rep['batch'] = data_in.shape[0]
        rep['ch_im_in'] = data_in.shape[1]
        rep['dim_im_in_x'] = data_in.shape[2]
        rep['dim_im_in_y'] = data_in.shape[3]
        rep['ch_im_out'] = grad_out.shape[1]
        rep['dim_im_out_x'] = grad_out.shape[2]
        rep['dim_im_out_y'] = grad_out.shape[3]

        # Offsets are placeholders; tiling fills them in later.
        rep['offset_grad_in_h'] = 0
        rep['offset_grad_in_w'] = 0
        rep['offset_grad_out_h'] = 0
        rep['offset_grad_out_w'] = 0

        rep['gradw_dim_im_in_x'] = data_in.shape[2]
        rep['gradw_dim_im_in_y'] = data_in.shape[3]

        return ctxt, True
len(node.attrs['dilations']) == 2, + ]) + if wellFormed: + if 'kernel_shape' not in node.attrs: + node.attrs['kernel_shape'] = node.inputs[2].shape[-2:] + self.operatorRepresentation['group'] = node.attrs['group'] + self.operatorRepresentation['pads'] = node.attrs['pads'] + self.operatorRepresentation['strides'] = node.attrs['strides'] + self.operatorRepresentation['dilations'] = node.attrs['dilations'] + self.operatorRepresentation['kernel_shape'] = node.attrs['kernel_shape'] + self.operatorRepresentation['dim_kernel_x'] = int(node.attrs['kernel_shape'][0]) + self.operatorRepresentation['dim_kernel_y'] = int(node.attrs['kernel_shape'][1]) + self.operatorRepresentation['stride_x'] = int(node.attrs['strides'][0]) + self.operatorRepresentation['stride_y'] = int(node.attrs['strides'][1]) + return wellFormed + + def parseNodeCtxt(self, ctxt, node, channels_first=True): + grad_out = ctxt.lookup(node.inputs[0].name) + data_in = ctxt.lookup(node.inputs[1].name) + weight = ctxt.lookup(node.inputs[2].name) + bias = ctxt.lookup(node.inputs[3].name) + grad_in = ctxt.lookup(node.outputs[0].name) + grad_weight = ctxt.lookup(node.outputs[1].name) + grad_bias = ctxt.lookup(node.outputs[2].name) + + self.operatorRepresentation['grad_out'] = grad_out.name + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['weight'] = weight.name + self.operatorRepresentation['bias'] = bias.name + self.operatorRepresentation['grad_in'] = grad_in.name + self.operatorRepresentation['grad_weight'] = grad_weight.name + self.operatorRepresentation['grad_bias'] = grad_bias.name + + self.operatorRepresentation['batch'] = data_in.shape[0] + self.operatorRepresentation['ch_im_in'] = data_in.shape[1] + self.operatorRepresentation['dim_im_in_x'] = data_in.shape[2] + self.operatorRepresentation['dim_im_in_y'] = data_in.shape[3] + self.operatorRepresentation['ch_im_out'] = grad_out.shape[1] + self.operatorRepresentation['dim_im_out_x'] = grad_out.shape[2] + 
self.operatorRepresentation['dim_im_out_y'] = grad_out.shape[3] + + self.operatorRepresentation['offset_grad_in_h'] = 0 + self.operatorRepresentation['offset_grad_in_w'] = 0 + self.operatorRepresentation['offset_grad_out_h'] = 0 + self.operatorRepresentation['offset_grad_out_w'] = 0 + + self.operatorRepresentation['gradw_dim_im_in_x'] = data_in.shape[2] + self.operatorRepresentation['gradw_dim_im_in_y'] = data_in.shape[3] + + return ctxt, True diff --git a/Deeploy/Targets/Generic/Platform.py b/Deeploy/Targets/Generic/Platform.py index e05e897270..9cdb13d96b 100644 --- a/Deeploy/Targets/Generic/Platform.py +++ b/Deeploy/Targets/Generic/Platform.py @@ -6,27 +6,27 @@ RemoveEmptyConvBiasPass, RemoveOnlySingletonReduceMeanPass from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \ StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer -from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicBatchNormBindings, BasicConcatBindings, \ - BasicConv1DBindings, BasicConv2DBindings, BasicConvTransposeBindings, BasicDebugPrintBindings, \ - BasicDequantBindings, BasicDivBindings, BasicDWConv1DBinding, BasicDWConv2DBindings, BasicGatherBindings, \ - BasicGELUBindings, BasicGEMMBindings, BasicITAPartialSoftmaxBinding, BasicITASoftmaxBinding, \ - BasicLayerNormBindings, BasicMatMulBindings, BasicMaxPool1DBindings, BasicMaxPool2DBindings, BasicMulBindings, \ - BasicPad1DBindings, BasicPad2DBindings, BasicPowBindings, BasicQuantBindings, BasicReduceMeanBindings, \ - BasicReduceSumBindings, BasicReluBinding, BasicReshapeBindings, BasicRQIntegerDivBinding, BasicRQSBindings, \ - BasicRQSGELUBinding, BasicSliceBindings, BasicSoftmaxBindings, BasicSqrtBindings, BasicTransposeBindings, \ - DummyBinding -from Deeploy.Targets.Generic.Layers import AddLayer, BatchNormalizationLayer, ConcatLayer, ConvLayer, \ - ConvTransposeLayer, DebugPrintLayer, DequantLayer, DivLayer, GatherLayer, GELULayer, GEMMLayer, 
ITAMaxLayer, \ - LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, PowLayer, QuantLayer, ReduceMeanLayer, \ - ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, SliceLayer, \ - SoftmaxLayer, SqrtLayer, TransposeLayer +from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicBatchNormBindings, \ + BasicConcatBindings, BasicConv1DBindings, BasicConv2DBindings, BasicConvTransposeBindings, \ + BasicDebugPrintBindings, BasicDequantBindings, BasicDivBindings, BasicDWConv1DBinding, BasicDWConv2DBindings, \ + BasicGatherBindings, BasicGELUBindings, BasicGEMMBindings, \ + BasicITAPartialSoftmaxBinding, BasicITASoftmaxBinding, BasicLayerNormBindings, BasicMatMulBindings, \ + BasicMaxPool1DBindings, BasicMaxPool2DBindings, BasicMulBindings, BasicPad1DBindings, BasicPad2DBindings, \ + BasicPowBindings, BasicQuantBindings, BasicReduceMeanBindings, BasicReduceSumBindings, BasicReluBinding, \ + BasicReshapeBindings, BasicRQIntegerDivBinding, BasicRQSBindings, BasicRQSGELUBinding, BasicSliceBindings, \ + BasicSoftmaxBindings, BasicSqrtBindings, BasicTransposeBindings, DummyBinding +from Deeploy.Targets.Generic.Layers import AddLayer, BatchNormalizationLayer, ConcatLayer, \ + ConvLayer, ConvTransposeLayer, DebugPrintLayer, DequantLayer, DivLayer, GatherLayer, GELULayer, GEMMLayer, \ + ITAMaxLayer, LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, PowLayer, \ + QuantLayer, ReduceMeanLayer, ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, \ + RQSiGELULayer, SliceLayer, SoftmaxLayer, SqrtLayer, TransposeLayer from Deeploy.Targets.Generic.Parsers import AddParser, BatchNormParser, ConcatParser, ConvTranspose1DParser, \ DebugParser, DequantParser, DivParser, DummyParser, FlattenParser, GatherParser, GELUParser, GenericConv1DParser, \ GenericConv2DParser, GenericDWConv1DParser, GenericDWConv2DParser, GenericGEMMParser, GenericMaxPool2DParser, \ - IntegerDivParser, ITAMaxParser, 
ITAPartialMaxParser, LayerNormParser, MatMulParser, MaxPool1DParser, MulParser, \ - Pad1DParser, Pad2DParser, PowParser, QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, \ - RequantShiftParser, ReshapeParser, RQIntegerDivParser, RQSiGELUParser, SliceParser, SoftmaxParser, SqrtParser, \ - TransposeParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser + IntegerDivParser, ITAMaxParser, ITAPartialMaxParser, LayerNormParser, MatMulParser, \ + MaxPool1DParser, MulParser, Pad1DParser, Pad2DParser, PowParser, QuantParser, ReduceMeanParser, ReduceSumParser, \ + ReluParser, RequantShiftParser, ReshapeParser, RQIntegerDivParser, RQSiGELUParser, SliceParser, SoftmaxParser, \ + SqrtParser, TransposeParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, ExtractPaddingFromConvPass, \ ExtractPaddingFromPoolPass, MatMulAddMergePass, MergeConstAddAndRequantPass, QuantPatternPass, \ @@ -118,7 +118,7 @@ 'Quant': QuantLayer([QuantMapper]), 'Dequant': DequantLayer([DequantMapper]), 'BatchNormalization': BatchNormalizationLayer([BatchNormalizationMapper]), - 'ConvTranspose': ConvTransposeLayer([ConvTransposeMapper]) + 'ConvTranspose': ConvTransposeLayer([ConvTransposeMapper]), # # For example, you can use the DummpyMapper, in case you want to test # # deployment or optimizations with GlobalAveragePool nodes but did not yet # # implement the corresponding kernel diff --git a/Deeploy/Targets/Generic/TileConstraints/MulTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/MulTileConstraint.py index 9f71012ffe..791ce193ee 100644 --- a/Deeploy/Targets/Generic/TileConstraints/MulTileConstraint.py +++ b/Deeploy/Targets/Generic/TileConstraints/MulTileConstraint.py @@ -2,10 +2,92 @@ # # SPDX-License-Identifier: Apache-2.0 -from .BOPTileConstraint import BOPTileConstraint +from typing import Dict, List, Tuple 
class MulTileConstraint(TileConstraint):
    """Tile constraint for element-wise Mul.

    When B is a scalar/broadcast constant it is excluded from the tiling
    solution (no L1 allocation needed) and therefore skipped in the DMA
    schedule. When B is a full-sized tensor it is tiled in lockstep with
    A and C.
    """

    dataIn1Name = "A"
    dataIn2Name = "B"
    dataOutName = "C"

    @classmethod
    def _B_is_constant(cls, parseDict: Dict, ctxt: NetworkContext) -> bool:
        # A ConstantBuffer B stays resident and is never tiled.
        return isinstance(ctxt.lookup(parseDict[cls.dataIn2Name]), ConstantBuffer)

    @classmethod
    def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
        aName = parseDict[cls.dataIn1Name]
        bName = parseDict[cls.dataIn2Name]
        outName = parseDict[cls.dataOutName]
        tileB = not cls._B_is_constant(parseDict, ctxt)

        for bufferName in (aName, bName, outName):
            tilerModel.addTensorDimToModel(ctxt, bufferName)

        # Element-wise op: A, C (and B when tiled) share every dimension.
        for dim in range(len(ctxt.lookup(aName).shape)):
            aVar = tilerModel.getTensorDimVar(tensorName = aName, dimIdx = dim)
            outVar = tilerModel.getTensorDimVar(tensorName = outName, dimIdx = dim)
            tilerModel.addConstraint(aVar == outVar)
            if tileB:
                bVar = tilerModel.getTensorDimVar(tensorName = bName, dimIdx = dim)
                tilerModel.addConstraint(aVar == bVar)

        return tilerModel

    @classmethod
    def serializeTilingSolution(
            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
            targetMemLevel: str, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
        outputCubes = [absCube.rectangle for absCube in absoluteOutputCubes]

        # B only appears in the schedule when the tiler allocated it.
        bTiled = operatorRepresentation[cls.dataIn2Name] in tilingSolution.tensorMemoryConstraints
        addrNames = [cls.dataIn1Name] + ([cls.dataIn2Name] if bTiled else []) + [cls.dataOutName]
        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
                                                                  operatorRepresentation, addrNames)

        replacements = {"size": [np.prod(cube.dims) for cube in outputCubes]}
        replacementTypes = {"size": PointerClass(uint16_t)}

        inputLoadSchedule = []
        for cube in outputCubes:
            entry = {cls.dataIn1Name: cube}
            if bTiled:
                entry[cls.dataIn2Name] = cube
            inputLoadSchedule.append(entry)
        outputLoadSchedule = [{cls.dataOutName: cube} for cube in outputCubes]

        tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
        return VariableReplacementScheme(replacements, replacementTypes), tilingSchedule
Deeploy.TilingExtension.TileConstraint import TileConstraint from Deeploy.TilingExtension.TilerModel import TilerModel -from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme class TransposeTileConstraint(TileConstraint): @@ -58,7 +59,17 @@ def serializeTilingSolution( invPerm = _invertPermutation(operatorRepresentation['perm']) inputCubes = [] for outCube in outputCubes: - inCube = _permuteHyperRectangle(outCube, invPerm) + # Workaround: MatMulLayer.computeShapes may inject leading batch dims into + # the Transpose output's ctxt shape (e.g. [K,N] → [1,K,N]) when A is 3-D. + # The actual Transpose only operates on the spatial (last len(invPerm)) dims; + # strip the extra leading batch dims before permuting so that inCube matches + # the real data_in (weight) shape which is not broadened. + numExtra = len(outCube.dims) - len(invPerm) + if numExtra > 0: + spatialCube = HyperRectangle(outCube.offset[numExtra:], outCube.dims[numExtra:]) + inCube = _permuteHyperRectangle(spatialCube, invPerm) + else: + inCube = _permuteHyperRectangle(outCube, invPerm) inputCubes.append(inCube) for i, dim in enumerate(inCube.dims): replacements[f"dimLen_{i}"].append(dim) diff --git a/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py index 146bcf699e..403d144872 100644 --- a/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py +++ b/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py @@ -668,6 +668,10 @@ def _split_transposes_fun(graph: gs.Graph, match: Match, name: str): perm = t1.attrs['perm'] inputVar = t1.inputs[0] + # If the Transpose input has no producer (it is a graph input variable), + # we cannot rewrite the producer's output → skip splitting. 
class _PropagateInputChecker(SignPropTypeChecker):
    """Shared boilerplate: the single output inherits nLevels and signedness
    from input 0 (pooling / gradient-reduction style float ops)."""

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [inputs[0].nLevels]

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [bool(inputs[0]._signed)]


class _FullRangeOutputChecker(SignPropTypeChecker):
    """Shared boilerplate: every output spans the full range of the first
    input pointer type (2**typeWidth levels).

    Subclasses set ``_signedOutputs`` to fix the reported signedness. The
    lists are sized to len(self.output_types) so typeInferOutput (which zips
    output_types with node.outputs) correctly types all outputs.
    """

    # Signedness reported for every output; override in subclasses.
    _signedOutputs = True

    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
        super().__init__(input_types, output_types)

    def _inferNumLevels(self, inputs: List[VariableBuffer],
                        operatorRepresentation: OperatorRepresentation) -> List[int]:
        return [2**(self.input_types[0].referencedType.typeWidth)] * len(self.output_types)

    def _inferSignedness(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
        return [self._signedOutputs] * len(self.output_types)


class MaxPoolGradChecker(_PropagateInputChecker):
    """TypeChecker for MaxPoolGrad: two float inputs (grad_output,
    original_input), one float output (grad_input)."""

    def checkOutputType(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> bool:
        # NOTE(review): unconditionally accepts the output type, bypassing the
        # usual validation — confirm this is intentional for float gradients.
        return True


class AveragePoolChecker(_PropagateInputChecker):
    """TypeChecker for AveragePool: output inherits levels/signedness from the
    pooled input."""


class SoftmaxCrossEntropyLossChecker(_FullRangeOutputChecker):
    """TypeChecker for SoftmaxCrossEntropyLoss (1 or 2 outputs)."""

    _signedOutputs = False

    def checkOutputType(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> bool:
        # The parser sets 'loss' to a non-empty string for 2-output nodes,
        # '' for 1-output. Match the actual count against this binding.
        actual_num_outputs = 2 if operatorRepresentation.get('loss', '') != '' else 1
        return actual_num_outputs == len(self.output_types)


class MSELossChecker(_FullRangeOutputChecker):
    """TypeChecker for MSELoss: unsigned full-range outputs."""

    _signedOutputs = False


class BatchNormInternalChecker(_FullRangeOutputChecker):
    """TypeChecker for ORT BatchNormInternal (training-mode BN forward pass).

    Inputs (5): X, gamma, beta, running_mean, running_var — all float32
    Outputs (5): Y, updated_running_mean, updated_running_var, saved_mean,
    saved_inv_std — all float32; outputs[1,2] have no consumers but do have
    tensor names.
    """


class BatchNormalizationGradChecker(_FullRangeOutputChecker):
    """TypeChecker for ORT BatchNormalizationGrad (BN backward pass).

    Inputs (5): dY, X, gamma, saved_mean, saved_inv_std — all float32
    Outputs (3): dX, dgamma, dbeta — all float32
    """


class WelfordReduceChecker(_FullRangeOutputChecker):
    """TypeChecker for WelfordReduce (split BN forward reduction).

    Inputs (1): X [N,C,H,W]
    Outputs (2): saved_mean [C], saved_inv_std [C]
    """


class ChannelNormalizeChecker(_FullRangeOutputChecker):
    """TypeChecker for ChannelNormalize (split BN forward elementwise).

    Inputs (5): X, saved_mean, saved_inv_std, gamma, beta
    Outputs (1): Y [N,C,H,W]
    """


class BNGradReduceChecker(_FullRangeOutputChecker):
    """TypeChecker for BNGradReduce (split BN backward reduction).

    Inputs (4): dY, X, saved_mean, saved_inv_std
    Outputs (2): dgamma [C], dbeta [C]
    """


class BNGradNormalizeChecker(_FullRangeOutputChecker):
    """TypeChecker for BNGradNormalize (split BN backward elementwise).

    Inputs (7): dY, X, saved_mean, saved_inv_std, gamma, dgamma, dbeta
    Outputs (1): dX [N,C,H,W]
    """


class GlobalAveragePoolChecker(_PropagateInputChecker):
    """TypeChecker for GlobalAveragePool.

    Input: data_in [N, C, H, W] float32
    Output: data_out [N, C, 1, 1] float32 — inherits levels/signedness.
    """


class GlobalAveragePoolGradChecker(_FullRangeOutputChecker):
    """TypeChecker for GlobalAveragePoolGrad.

    Input: dY [N, C, 1, 1] float32
    Output: dX [N, C, H, W] float32 — signed, full float32 range.
    """


class InPlaceAccumulatorV2Checker(_FullRangeOutputChecker):
    """Type checker for ORT InPlaceAccumulatorV2 operator (com.microsoft).

    Inputs:
        0: buffer (float32*)
        1: gradient (float32*)
        2: lazy_reset_grad (uint8_t* or bool* - 1 element)
    Output:
        0: output_buffer (float32*) — same precision as the buffer input.
    """


class PULPConvGradBChecker(_PropagateInputChecker):
    """TypeChecker for ConvGradB which only has one input (output_grad)."""
HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \ - QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, RQHardswishChecker, SGDChecker, \ - SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker +from Deeploy.Targets.Generic.TypeCheckers import AddChecker, BatchNormInternalChecker, \ + BatchNormalizationGradChecker, BNGradNormalizeChecker, BNGradReduceChecker, \ + ChannelNormalizeChecker, ConcatChecker, ConvChecker, DequantChecker, \ + WelfordReduceChecker, \ + GatherChecker, GELUChecker, GEMMChecker, GlobalAveragePoolChecker, GlobalAveragePoolGradChecker, \ + HardswishChecker, InPlaceAccumulatorV2Checker, LayerNormChecker, MatMulChecker, MaxPoolGradChecker, MulChecker, \ + MSELossChecker, QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, RQHardswishChecker, \ + SGDChecker, SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker, \ + PULPConvGradBChecker from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling @@ -28,14 +33,17 @@ from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture from Deeploy.Targets.PULPOpen.DMA.L3Dma import l3DmaHack from Deeploy.Targets.PULPOpen.DMA.MchanDma import MchanDma -from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, DMASliceTemplate, FloatAddTemplate, FloatConvTemplate, \ - FloatGELUTemplate, FloatGemmTemplate, FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, \ - FloatMulTemplate, FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, \ - MatrixVectorTemplate, MaxPoolTemplate, MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, \ - RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, SoftmaxCrossEntropyLossTemplate, 
TallGEMMTemplate, \ - TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate -from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \ - PULPRequantShiftChecker +from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, DMASliceTemplate, FloatAddTemplate, FloatAveragePoolTemplate, \ + FloatBatchNormTemplate, FloatConvGradTemplate, FloatConvTemplate, FloatGELUTemplate, FloatGemmTemplate, \ + FloatGlobalAveragePoolTemplate, \ + FloatInPlaceAccumulatorV2Template, FloatLayernormTemplate, FloatMatMulTemplate, \ + FloatMaxPoolTemplate, FloatMulTemplate, FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, \ + GEMMTemplate, MatrixVectorTemplate, MaxPoolTemplate, MSELossTemplate, MulTemplate, ReduceMeanTemplate, \ + RequantShiftTemplate, ReshapeTemplate, RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, \ + SoftmaxCrossEntropyLossTemplate, \ + TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate +from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, \ + PULPMaxPoolChecker, PULPRequantShiftChecker from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \ TilingVariableReplacementUpdate @@ -69,10 +77,6 @@ closureCallTemplate = _clusterForkClosureCallTemplate) TilingCallClosure = partial(ClosureGeneration, closureSuffix = "_tiling_closure") -FunctionCallClosure = partial(ClosureGeneration, closureSuffix = "_closure") -ForkClosure = partial(ClosureGeneration, - closureSuffix = "_cluster_fork", - closureCallTemplate = _clusterForkClosureCallTemplate) MemoryAwareClusterClosure = partial(MemoryAwareClosureGeneration, closureSuffix = "_cluster_entry", @@ -189,6 +193,11 @@ ] + [ NodeBinding(AddChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), FloatAddTemplate.referenceTemplate, ForkTransformer) +] + [ + 
# Float Conv2D backward bindings. All grads take two float32 inputs and
# produce one float32 output; templates come from FloatConvGradTemplate.

# Weight gradient (dW) via im2col, dispatched with the ClusterTransformer.
PULPFloatConvGradW2DBindings = [
    NodeBinding(
        ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
        FloatConvGradTemplate.referenceConvGradW2DIm2ColTemplate, ClusterTransformer)
]

# Input gradient (dX) via im2col, tiled, dispatched with the ForkTransformer.
PULPFloatConvGradX2DBindings = [
    NodeBinding(
        ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
        FloatConvGradTemplate.referenceConvGradX2DIm2ColTiledTemplate, ForkTransformer)
]

# Depthwise variants: dX (tiled, fork) and dW (cluster).
PULPFloatDWConvGradX2DBindings = [
    NodeBinding(
        ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
        FloatConvGradTemplate.referenceDWConvGradX2DTiledTemplate, ForkTransformer)
]

PULPFloatDWConvGradW2DBindings = [
    NodeBinding(
        ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
        FloatConvGradTemplate.referenceDWConvGradW2DTemplate, ClusterTransformer)
]

# Pointwise (1x1) variants: dW and dX, both on the ClusterTransformer.
PULPFloatPWConvGradW2DBindings = [
    NodeBinding(
        ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
        FloatConvGradTemplate.referencePWConvGradW2DTemplate, ClusterTransformer)
]

PULPFloatPWConvGradX2DBindings = [
    NodeBinding(
        ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
        FloatConvGradTemplate.referencePWConvGradX2DTemplate, ClusterTransformer)
]

# Bias gradient (dB): single float32 input (dY), single float32 output.
PULPFloatConvGradBBindings = [
    NodeBinding(
        PULPConvGradBChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
        FloatConvGradTemplate.referenceConvGradB2DTemplate, ClusterTransformer)
]
running_mean, running_var), 5 outputs (Y, urm, urv, saved_mean, saved_inv_std) +PULPBatchNormInternalBindings = [ + NodeBinding( + BatchNormInternalChecker( + [PointerClass(float32_t)] * 5, + [PointerClass(float32_t)] * 5), FloatBatchNormTemplate.batchNormInternalTemplate, + ForkTransformer) +] + +# 5 inputs (dY, X, gamma, saved_mean, saved_inv_std), 3 outputs (dX, dgamma, dbeta) +PULPBatchNormalizationGradBindings = [ + NodeBinding( + BatchNormalizationGradChecker( + [PointerClass(float32_t)] * 5, + [PointerClass(float32_t)] * 3), FloatBatchNormTemplate.batchNormGradTemplate, + ForkTransformer) +] + + +# Split BN forward: WelfordReduce (1 input X, 2 outputs saved_mean, saved_inv_std) +PULPWelfordReduceBindings = [ + NodeBinding( + WelfordReduceChecker( + [PointerClass(float32_t)] * 1, + [PointerClass(float32_t)] * 2), FloatBatchNormTemplate.welfordReduceTemplate, + ForkTransformer) +] + +# Split BN forward: ChannelNormalize (5 inputs, 1 output) +PULPChannelNormalizeBindings = [ + NodeBinding( + ChannelNormalizeChecker( + [PointerClass(float32_t)] * 5, + [PointerClass(float32_t)] * 1), FloatBatchNormTemplate.channelNormalizeTemplate, + ForkTransformer) +] + +# Split BN backward: BNGradReduce (4 inputs, 2 outputs) +PULPBNGradReduceBindings = [ + NodeBinding( + BNGradReduceChecker( + [PointerClass(float32_t)] * 4, + [PointerClass(float32_t)] * 2), FloatBatchNormTemplate.bnGradReduceTemplate, + ForkTransformer) +] + +# Split BN backward: BNGradNormalize (7 inputs, 1 output) +PULPBNGradNormalizeBindings = [ + NodeBinding( + BNGradNormalizeChecker( + [PointerClass(float32_t)] * 7, + [PointerClass(float32_t)] * 1), FloatBatchNormTemplate.bnGradNormalizeTemplate, + ForkTransformer) +] + PULPRQSMatrixVecBindings = [ NodeBinding( PULPLinearChecker([PointerClass(type1), @@ -279,6 +392,72 @@ FloatMaxPoolTemplate.referenceTemplate, ForkTransformer) ] +PULPAveragePool2DBindings = [ + NodeBinding(PULPMaxPoolChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + 
FloatAveragePoolTemplate.referenceTemplate, ForkTransformer) +] + +PULPAveragePoolGrad2DBindings = [ + NodeBinding(PULPMaxPoolChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatAveragePoolTemplate.referenceGradTemplate, ForkTransformer) +] + +# 1 input (data_in [N,C,H,W]), 1 output (data_out [N,C,1,1]) +PULPGlobalAveragePool2DBindings = [ + NodeBinding( + GlobalAveragePoolChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatGlobalAveragePoolTemplate.globalAveragePoolTemplate, + ForkTransformer) +] + +# 1 input (dY [N,C,1,1]), 1 output (dX [N,C,H,W]) +PULPGlobalAveragePoolGrad2DBindings = [ + NodeBinding( + GlobalAveragePoolGradChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatGlobalAveragePoolTemplate.globalAveragePoolGradTemplate, + ForkTransformer) +] + +PULPMaxPoolGrad2DBindings = [ + NodeBinding( + MaxPoolGradChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatMaxPoolTemplate.referenceGradTemplate, ForkTransformer) +] + + +PULPAveragePool2DBindings = [ + NodeBinding(PULPMaxPoolChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatAveragePoolTemplate.referenceTemplate, ForkTransformer) +] + +PULPAveragePoolGrad2DBindings = [ + NodeBinding(PULPMaxPoolChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatAveragePoolTemplate.referenceGradTemplate, ForkTransformer) +] + +# 1 input (data_in [N,C,H,W]), 1 output (data_out [N,C,1,1]) +PULPGlobalAveragePool2DBindings = [ + NodeBinding( + GlobalAveragePoolChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatGlobalAveragePoolTemplate.globalAveragePoolTemplate, + ForkTransformer) +] + +# 1 input (dY [N,C,1,1]), 1 output (dX [N,C,H,W]) +PULPGlobalAveragePoolGrad2DBindings = [ + NodeBinding( + GlobalAveragePoolGradChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatGlobalAveragePoolTemplate.globalAveragePoolGradTemplate, + ForkTransformer) +] + 
+PULPMaxPoolGrad2DBindings = [ + NodeBinding( + MaxPoolGradChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatMaxPoolTemplate.referenceGradTemplate, ForkTransformer) +] + + PULPRQSConv1DBindings = [ NodeBinding( PULPConvChecker([PointerClass(_type), @@ -357,17 +536,49 @@ SoftmaxCrossEntropyLossTemplate.referenceTemplate, ForkTransformer) for type in IntegerDataTypes ] +# Dual-output binding: outputs[0]=loss (scalar), outputs[1]=log_prob +PULPSoftmaxCrossEntropyLossDualOutputBindings = [ + NodeBinding( + SoftmaxCrossEntropyLossChecker([PointerClass(float32_t), PointerClass(type)], + [PointerClass(float32_t), PointerClass(float32_t)]), + SoftmaxCrossEntropyLossTemplate.referenceDualOutputTemplate, ForkTransformer) for type in IntegerDataTypes +] + PULPSoftmaxCrossEntropyLossGradBindings = [ NodeBinding( SoftmaxCrossEntropyLossChecker([PointerClass(float32_t), PointerClass(type)], [PointerClass(float32_t)]), SoftmaxCrossEntropyLossTemplate.referenceGradientTemplate, ForkTransformer) for type in IntegerDataTypes ] +PULPMSELossBindings = [ + NodeBinding(MSELossChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + MSELossTemplate.referenceTemplate, ForkTransformer) +] + +PULPMSELossGradBindings = [ + NodeBinding(MSELossChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + MSELossTemplate.referenceGradientTemplate, ForkTransformer) +] + PULPSGDBindings = [ NodeBinding(SGDChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), SGDTemplate.referenceTemplate, ForkTransformer) ] +PULPInPlaceAccumulatorV2Bindings = [ + NodeBinding( + InPlaceAccumulatorV2Checker( + [PointerClass(float32_t), PointerClass(float32_t), PointerClass(uint8_t)], [PointerClass(float32_t)]), + FloatInPlaceAccumulatorV2Template.referenceTemplate, ForkTransformer) +] + +PULPInPlaceAccumulatorV2TiledBindings = [ + NodeBinding( + InPlaceAccumulatorV2Checker( + 
[PointerClass(float32_t), PointerClass(float32_t), PointerClass(uint8_t)], [PointerClass(float32_t)]), + FloatInPlaceAccumulatorV2Template.tiledReferenceTemplate, ForkTransformer) +] + PULPTransposeBindings = [ NodeBinding(TransposeChecker([PointerClass(type)], [PointerClass(type)]), TransposeTemplate.referenceTemplate, ForkTransformer) for type in IntegerDataTypes @@ -379,6 +590,9 @@ PULPConcatBindings = [ NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]), ConcatTemplate.referenceTemplate, ClusterTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ConcatChecker([PointerClass(float_type), PointerClass(float_type)], [PointerClass(float_type)]), + ConcatTemplate.referenceTemplate, ClusterTransformer) for float_type in FloatDataTypes ] PULPiRMSNormBindings = [ @@ -420,18 +634,33 @@ PULPReluBinding = NodeBinding(ReluChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), FloatReluTemplate.referenceTemplate, ForkTransformer) +PULPReluGradBinding = NodeBinding( + ReluChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatReluTemplate.referenceGradTemplate, ForkTransformer) + PULPLayernormBinding = NodeBinding( LayerNormChecker( + # inputs: data_in (X), weight (scale/gamma), bias (beta) [PointerClass(float32_t), PointerClass(float32_t), - PointerClass(float32_t)], [PointerClass(float32_t)]), FloatLayernormTemplate.referenceTemplate, + PointerClass(float32_t)], + # outputs: data_out (Y), mean stash, inv_std_dev stash + [PointerClass(float32_t), PointerClass(float32_t), + PointerClass(float32_t)]), FloatLayernormTemplate.referenceTemplate, ForkTransformer) PULPLayernormGradBinding = NodeBinding( LayerNormChecker( + # inputs: grad_in (dY), data_in (X), weight (scale/gamma), + # mean stash, inv_std_dev stash [PointerClass(float32_t), PointerClass(float32_t), PointerClass(float32_t), - PointerClass(float32_t)], [PointerClass(float32_t)]), 
FloatLayernormTemplate.referenceGradTemplate, + PointerClass(float32_t), + PointerClass(float32_t)], + # outputs: grad_out (dX), weight_grad (dscale), bias_grad (dbias) + [PointerClass(float32_t), + PointerClass(float32_t), + PointerClass(float32_t)]), FloatLayernormTemplate.referenceGradTemplate, ForkTransformer) PULPFloatGELUBinding = NodeBinding( @@ -459,3 +688,4 @@ NodeBinding(DequantChecker([PointerClass(int32_t)], [PointerClass(float32_t)]), DequantTemplate.referenceTemplate, ForkTransformer), ] + diff --git a/Deeploy/Targets/PULPOpen/DMA/MchanDma.py b/Deeploy/Targets/PULPOpen/DMA/MchanDma.py index 93bf699dc6..87f973d3e9 100644 --- a/Deeploy/Targets/PULPOpen/DMA/MchanDma.py +++ b/Deeploy/Targets/PULPOpen/DMA/MchanDma.py @@ -3,9 +3,9 @@ # SPDX-License-Identifier: Apache-2.0 import math -from typing import Dict, Tuple +from typing import Dict, List, Tuple -from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer +from Deeploy.DeeployTypes import CodeSnippet, NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer from Deeploy.TilingExtension.AsyncDma import AsyncDma, DirectionWaitingStrategy, DmaDirection, Future @@ -50,6 +50,8 @@ def checkTransfer(self, ctxt: NetworkContext, externalBuffer: VariableBuffer, lo assert strideLoc[0] == shape[1] and strideLoc[ 1] == 1, "Mchan supports only contigous transfers for local memory" + _MAX_1D_TRANSFER_BYTES = 1 << 17 # 131072 bytes: max representable in 17-bit mchan cmd size field + def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, shape: Tuple[int, ...], strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], direction: DmaDirection, future: Future) -> OperatorRepresentation: @@ -65,10 +67,10 @@ def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBu mchanFlags += (1 << 3) # event enable mchanTransferSize = math.prod(shape) - mchanTransferSizeBits = math.ceil(math.log2(mchanTransferSize)) - assert 
mchanTransferSizeBits <= 17, ( + assert mchanTransferSize <= self._MAX_1D_TRANSFER_BYTES, ( "The transfer size is not representable with 17 bits. " - f"Received transfer size {mchanTransferSize} that requires {mchanTransferSizeBits}") + f"Received transfer size {mchanTransferSize} that requires " + f"{math.ceil(math.log2(mchanTransferSize))} bits") operatorRepresentation["cmd"] = (mchanFlags << 17) + mchanTransferSize @@ -77,3 +79,30 @@ def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBu operatorRepresentation["stride_2d"] = strideExt[0] return operatorRepresentation + + def transfer(self, ctxt: NetworkContext, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, + shape: Tuple[int, ...], strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], + direction: DmaDirection, future: Future) -> List[CodeSnippet]: + # For 1D transfers that exceed the 17-bit mchan limit, split into chunks. + totalSize = math.prod(shape) + if len(shape) == 1 and totalSize > self._MAX_1D_TRANSFER_BYTES: + mchanFlags = 0 + mchanFlags += (1 << 0) if direction == "ExternalToLocal" else 0 + mchanFlags += (1 << 1) # increment addresses + mchanFlags += (1 << 3) # event enable + template = self._transferTemplates[1] + chunks: List[CodeSnippet] = [] + offset = 0 + while offset < totalSize: + chunkSize = min(self._MAX_1D_TRANSFER_BYTES, totalSize - offset) + cmd = (mchanFlags << 17) + chunkSize + opRepr: OperatorRepresentation = { + "loc": f"((char*){localBuffer.name} + {offset})", + "ext": f"((char*){externalBuffer.name} + {offset})", + "future": future.name, + "cmd": cmd, + } + chunks.append(CodeSnippet(template, opRepr)) + offset += chunkSize + return chunks + return super().transfer(ctxt, externalBuffer, localBuffer, shape, strideExt, strideLoc, direction, future) diff --git a/Deeploy/Targets/PULPOpen/Parsers.py b/Deeploy/Targets/PULPOpen/Parsers.py index 5c5951eaba..73666d57ee 100644 --- a/Deeploy/Targets/PULPOpen/Parsers.py +++ 
b/Deeploy/Targets/PULPOpen/Parsers.py @@ -4,12 +4,14 @@ import math from typing import Tuple +import numpy as np import onnx_graphsurgeon as gs from Deeploy.DeeployTypes import NetworkContext -from Deeploy.Targets.Generic.Parsers import Conv2DParser, GEMMParser, ReduceMeanParser, RQSConv1DParser, \ - RQSConv2DParser, RQSParserInterface +from Deeploy.Targets.Generic.Parsers import Conv2DGradWParser, Conv2DGradXParser, Conv2DGradXWParser, \ + Conv2DGradXWBParser, Conv2DParser, GEMMParser, ReduceMeanParser, RQSConv1DParser, RQSConv2DParser, \ + RQSParserInterface class PULPConv2DParser(RQSConv2DParser): @@ -75,10 +77,10 @@ def parseNode(self, node: gs.Node) -> (bool): # Current PULP kernel only supports grouping of 1 self.operatorRepresentation['group'] == 1, - # Make sure padding is square - self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], - self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], - self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1], + # Make sure padding is symmetric (left==right, top==bottom) + # but top/bottom can differ from left/right + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], # top == bottom + self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], # left == right # Check number of inputs # 2 inputs if no bias, 3 if layer has bias @@ -133,10 +135,10 @@ def parseNode(self, node: gs.Node) -> (bool): if wellFormed: # Check if the node is a depthwise convolution ret = all([ - # Make sure padding is square - self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], - self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], - self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1], + # Make sure padding is symmetric (left==right, top==bottom) + # but top/bottom can differ from left/right + 
self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], # top == bottom + self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], # left == right # Check number of inputs # 2 inputs if no bias, 3 if layer has bias @@ -485,3 +487,173 @@ def parseNodeCtxt(self, return newCtxt, True else: return ctxt, False + + +class PULPConvGradX2DParser(Conv2DGradXParser): + + def __init__(self, noBiasHoisting=True): + super().__init__(noBiasHoisting) + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if ret: + self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][0]) + self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) + self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) + self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3]) + + return newCtxt, True + + return ctxt, False + + +class PULPDWConvGradX2DParser(PULPConvGradX2DParser): + + def __init__(self, noBiasHoisting=True): + super().__init__(noBiasHoisting) + + def parseNode(self, node: gs.Node) -> bool: + + wellFormed = super().parseNode(node) + + ret = all([ + self.operatorRepresentation['group'] > 1, + ]) + + return wellFormed + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if self.operatorRepresentation['group'] == ctxt.lookup(self.operatorRepresentation['weight']).shape[0]: + return newCtxt, True + + return ctxt, False + + +class PULPPWConvGradX2DParser(PULPConvGradX2DParser): + + def __init__(self, noBiasHoisting=True): + super().__init__(noBiasHoisting) + + def parseNode(self, node: gs.Node) -> bool: + + 
wellFormed = super().parseNode(node) + + kernel_shape = self.operatorRepresentation['kernel_shape'] + if kernel_shape != [1, 1]: + return False + + return wellFormed and True + + +class PULPConvGradW2DParser(Conv2DGradWParser): + + def __init__(self, noBiasHoisting=True): + super().__init__(noBiasHoisting) + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if ret: + self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][0]) + self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) + self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) + self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3]) + + return newCtxt, True + + return ctxt, False + + +class PULPDWConvGradW2DParser(PULPConvGradW2DParser): + + def __init__(self, noBiasHoisting=True): + super().__init__(noBiasHoisting) + + def parseNode(self, node: gs.Node) -> bool: + + wellFormed = super().parseNode(node) + + ret = all([ + self.operatorRepresentation['group'] > 1, + ]) + + return wellFormed and ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if self.operatorRepresentation['group'] == ctxt.lookup( + self.operatorRepresentation['grad_weight']).shape[0]: + return newCtxt, True + + return ctxt, False + + +class PULPPWConvGradW2DParser(PULPConvGradW2DParser): + + def __init__(self, noBiasHoisting=True): + super().__init__(noBiasHoisting) + + def parseNode(self, node: gs.Node) -> bool: + + wellFormed = super().parseNode(node) + + kernel_shape = self.operatorRepresentation['kernel_shape'] + + if kernel_shape != [1, 1]: + return False + + return wellFormed and 
True + + +class PULPConvGradXW2DParser(Conv2DGradXWParser): + """PULP-specific combined ConvGrad no-bias (3 inputs, 2 outputs).""" + + def __init__(self): + super().__init__() + + def parseNodeCtxt(self, ctxt, node, channels_first=True): + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + if ret: + self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][0]) + self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) + self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) + self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3]) + return newCtxt, True + return ctxt, False + + +class PULPConvGradXWB2DParser(Conv2DGradXWBParser): + """PULP-specific combined ConvGrad with bias (4 inputs, 3 outputs).""" + + def __init__(self): + super().__init__() + + def parseNodeCtxt(self, ctxt, node, channels_first=True): + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + if ret: + self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][0]) + self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) + self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) + self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3]) + return newCtxt, True + return ctxt, False diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py index 7456dd9e1b..32d45a1d4c 100644 --- a/Deeploy/Targets/PULPOpen/Platform.py +++ b/Deeploy/Targets/PULPOpen/Platform.py @@ -13,18 +13,31 @@ from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper from Deeploy.Targets.Generic.Bindings import BasicGEMMBindings, BasicPad1DBindings, BasicPad2DBindings, \ BasicRQIntegerDivBinding -from Deeploy.Targets.Generic.Layers import 
AddLayer, ConcatLayer, ConvLayer, GatherLayer, GELUGradLayer, GELULayer, \ - GEMMLayer, LayerNormGradLayer, LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, \ - ReduceMeanLayer, ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, \ - RQSiHardswishLayer, SGDLayer, SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, \ - SoftmaxGradLayer, SoftmaxLayer, TransposeLayer, iHardswishLayer, iRMSNormLayer -from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, DequantParser, FlattenParser, GatherParser, \ - GELUGradParser, GELUParser, GEMMParser, LayerNormGradParser, LayerNormParser, MatMulParser, MaxPool1DParser, \ - MaxPool2DParser, MulParser, Pad1DParser, Pad2DParser, QuantParser, ReduceSumParser, ReluParser, \ - RequantShiftParser, ReshapeParser, RQAddParser, RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, \ - SGDParser, SliceParser, SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, SoftmaxGradParser, \ - SoftmaxParser, TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, \ - iSoftmaxParser +from Deeploy.Targets.Generic.Layers import AddLayer, AveragePoolGradLayer, AveragePoolLayer, \ + BatchNormInternalLayer, BatchNormalizationGradLayer, BNGradNormalizeLayer, BNGradReduceLayer, \ + ChannelNormalizeLayer, ConcatLayer, ConvLayer, \ + GlobalAveragePoolLayer, GlobalAveragePoolGradLayer, \ + ConvGradBLayer, ConvGradWLayer, ConvGradXLayer, GatherLayer, GELUGradLayer, GELULayer, GEMMLayer, \ + InPlaceAccumulatorV2Layer, LayerNormGradLayer, LayerNormLayer, MatMulLayer, \ + WelfordReduceLayer, \ + MaxPoolGradLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, ReduceMeanLayer, ReduceSumLayer, ReluGradLayer, \ + MSELossGradLayer, MSELossLayer, \ + ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, RQSiHardswishLayer, SGDLayer, \ + SliceLayer, SoftmaxCrossEntropyLossGradLayer, 
SoftmaxCrossEntropyLossLayer, SoftmaxGradLayer, SoftmaxLayer, \ + TransposeLayer, iHardswishLayer, iRMSNormLayer +from Deeploy.Targets.Generic.Parsers import AddParser, AveragePool2DParser, BatchNormInternalParser, \ + BatchNormalizationGradParser, BNGradNormalizeParser, BNGradReduceParser, \ + ChannelNormalizeParser, ConcatParser, Conv2DGradBParser, \ + WelfordReduceParser, \ + GlobalAveragePoolParser, GlobalAveragePoolGradParser, \ + DequantParser, FlattenParser, GatherParser, GELUGradParser, GELUParser, GEMMParser, \ + InPlaceAccumulatorV2Parser, LayerNormGradParser, LayerNormParser, MatMulParser, \ + MaxPool1DParser, MaxPool2DParser, MaxPoolGradParser, MulParser, Pad1DParser, Pad2DParser, QuantParser, ReduceSumParser, \ + ReluGradParser, ReluParser, RequantShiftParser, ReshapeParser, RQAddParser, RQIntegerDivParser, RQSiGELUParser, \ + MSELossGradParser, MSELossParser, \ + RQSiHardswishParser, SGDParser, SliceParser, SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, \ + SoftmaxGradParser, SoftmaxParser, TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, \ + iRMSNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, IntegerDivRequantMergePass, \ MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, QuantPatternPass, RQSSplitPass, \ @@ -32,32 +45,46 @@ from Deeploy.Targets.PULPOpen.Bindings import BasicDequantBindings, BasicQuantBindings, PULPDMASliceBindings, \ PULPDWConv1DBinding from Deeploy.Targets.PULPOpen.Layers import PULPRQSConvLayer, PULPRQSGEMMLayer -from Deeploy.Targets.PULPOpen.Parsers import PULPConv1DParser, PULPConv2DParser, PULPDWConv1DParser, \ - PULPDWConv2DParser, PULPFPConv2DParser, PULPFPDWConv2DParser, PULPGEMMParser, PULPMatrixVecParser, \ - PULPReduceMeanParser, PULPTallGEMMParser +from Deeploy.Targets.PULPOpen.Parsers import 
PULPConv1DParser, PULPConv2DParser, PULPConvGradW2DParser, \ + PULPConvGradX2DParser, PULPDWConv1DParser, PULPDWConv2DParser, PULPDWConvGradW2DParser, \ + PULPDWConvGradX2DParser, PULPFPConv2DParser, PULPFPDWConv2DParser, PULPGEMMParser, PULPMatrixVecParser, \ + PULPPWConvGradW2DParser, PULPPWConvGradX2DParser, PULPReduceMeanParser, PULPTallGEMMParser from Deeploy.Targets.PULPOpen.Templates import AllocateTemplate, FreeTemplate -from Deeploy.Targets.PULPOpen.Tiler import PULPAddTilingReadyBindings, PULPConcatTilingReadyBindings, \ - PULPConv2DTilingReadyBindings, PULPDWConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, \ +from Deeploy.Targets.PULPOpen.Tiler import PULPAddTilingReadyBindings, PULPAveragePool2DTilingReadyBindings, \ + PULPAveragePoolGrad2DTilingReadyBindings, PULPConcatTilingReadyBindings, PULPConv2DTilingReadyBindings, \ + PULPConvGradW2DTilingReadyBindings, PULPConvGradX2DTilingReadyBindings, PULPDWConv2DTilingReadyBindings, \ + PULPDWConvGradW2DTilingReadyBindings, PULPDWConvGradX2DTilingReadyBindings, PULPFlattenTilingReadyBindings, \ PULPFPGELUGradTilingReadyBindings, PULPFPGELUTilingReadyBindings, PULPFPGEMMTilingReadyBindings, \ - PULPGatherTilingReadyBindings, PULPiHardswishTilingReadyBindings, PULPiRMSNormTilingReadyBindings, \ - PULPiRQSGELUTilingReadyBindings, PULPLayernormGradTilingReadyBindings, PULPLayernormTilingReadyBindings, \ - PULPMatMulTilingReadyBindings, PULPMaxPool1DTilingReadyBindings, PULPMaxPool2DTilingReadyBindings, \ - PULPMulTilingReadyBindings, PULPReduceMeanTilingReadyBindings, PULPReduceSumTilingReadyBindings, \ + PULPGatherTilingReadyBindings, PULPiHardswishTilingReadyBindings, PULPInPlaceAccumulatorV2TilingReadyBindings, \ + PULPiRMSNormTilingReadyBindings, PULPiRQSGELUTilingReadyBindings, PULPLayernormGradTilingReadyBindings, \ + PULPLayernormTilingReadyBindings, PULPMatMulTilingReadyBindings, PULPMaxPool1DTilingReadyBindings, \ + PULPMaxPool2DTilingReadyBindings, \ + PULPMulTilingReadyBindings, 
PULPPWConvGradW2DTilingReadyBindings, PULPPWConvGradX2DTilingReadyBindings, \ + PULPReduceMeanTilingReadyBindings, PULPReduceSumTilingReadyBindings, PULPReluGradTilingReadyBindings, \ PULPReluTilingReadyBindings, PULPRQAddTilingReadyBindings, PULPRQSConv1DTilingReadyBindings, \ - PULPRQSConv2DTilingReadyBindings, PULPRQSDWConv2DTilingReadyBindings, PULPRQSGEMMTilingReadyBindings, \ - PULPRQSiHardswishTilingReadyBindings, PULPRQSMatrixVecTilingReadyBindings, PULPRQSTallGEMMTilingReadyBindings, \ - PULPRQSTilingReadyBindings, PULPSGDTilingReadyBindings, PULPSliceTilingReadyBindings, \ - PULPSoftmaxCrossEntropyGradTilingReadyBindings, PULPSoftmaxCrossEntropyTilingReadyBindings, \ - PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, PULPTransposeTilingReadyBindings, \ - PULPUniformRQSTilingReadyBindings + PULPRQSConv2DTilingReadyBindings, \ + PULPRQSDWConv2DTilingReadyBindings, PULPRQSGEMMTilingReadyBindings, PULPRQSiHardswishTilingReadyBindings, \ + PULPMaxPoolGrad2DTilingReadyBindings, PULPRQSMatrixVecTilingReadyBindings, \ + PULPRQSTallGEMMTilingReadyBindings, PULPRQSTilingReadyBindings, \ + PULPBatchNormInternalTilingReadyBindings, PULPBatchNormalizationGradTilingReadyBindings, \ + PULPWelfordReduceTilingReadyBindings, PULPChannelNormalizeTilingReadyBindings, \ + PULPBNGradReduceTilingReadyBindings, PULPBNGradNormalizeTilingReadyBindings, \ + PULPGlobalAveragePool2DTilingReadyBindings, PULPGlobalAveragePoolGrad2DTilingReadyBindings, \ + PULPSGDTilingReadyBindings, PULPSliceTilingReadyBindings, PULPSoftmaxCrossEntropyGradTilingReadyBindings, \ + PULPSoftmaxCrossEntropyLossDualOutputTilingReadyBindings, \ + PULPSoftmaxCrossEntropyTilingReadyBindings, PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, \ + PULPTransposeTilingReadyBindings, PULPUniformRQSTilingReadyBindings, \ + PULPMSELossTilingReadyBindings, PULPMSELossGradTilingReadyBindings, \ + PULPConvGradBTilingReadyBindings from 
Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import PULPAddRequantMergePass, \ PULPConvRequantMergePass, PULPGEMMRequantMergePass, PULPMatMulRequantMergePass +from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.SplitConvGradPass import SplitConvGradPass +# ── Inference NodeMappers ────────────────────────────────────────────────── RQAddMapper = NodeMapper(RQAddParser(), PULPRQAddTilingReadyBindings) AddMapper = NodeMapper(AddParser(), PULPAddTilingReadyBindings) FlattenMapper = NodeMapper(FlattenParser(), PULPFlattenTilingReadyBindings) GELUMapper = NodeMapper(GELUParser(), PULPFPGELUTilingReadyBindings) -GELUGradMapper = NodeMapper(GELUGradParser(), PULPFPGELUGradTilingReadyBindings) GatherMapper = NodeMapper(GatherParser(), PULPGatherTilingReadyBindings) MulMapper = NodeMapper(MulParser(), PULPMulTilingReadyBindings) Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings) @@ -88,39 +115,62 @@ MaxPool1DMapper = NodeMapper(MaxPool1DParser(), PULPMaxPool1DTilingReadyBindings) MaxPool2DMapper = NodeMapper(MaxPool2DParser(), PULPMaxPool2DTilingReadyBindings) LayerNormMapper = NodeMapper(LayerNormParser(), PULPLayernormTilingReadyBindings) -LayerNormGradMapper = NodeMapper(LayerNormGradParser(), PULPLayernormGradTilingReadyBindings) ReluMapper = NodeMapper(ReluParser(), PULPReluTilingReadyBindings) SoftmaxMapper = NodeMapper(SoftmaxParser(), PULPSoftmaxTilingReadyBindings) -SoftmaxGradMapper = NodeMapper(SoftmaxGradParser(), PULPSoftmaxGradTilingReadyBindings) Softmax_int8_Mapper = NodeMapper(iSoftmaxParser(), PULPSoftmaxTilingReadyBindings) - ConcatMapper = NodeMapper(ConcatParser(), PULPConcatTilingReadyBindings) - DMASliceMapper = NodeMapper(SliceParser(), PULPDMASliceBindings) - SliceMapper = NodeMapper(SliceParser(), PULPSliceTilingReadyBindings) - iRMSNormMapper = NodeMapper(iRMSNormParser(), PULPiRMSNormTilingReadyBindings) - iHardswishMapper = NodeMapper(iHardswishParser(), PULPiHardswishTilingReadyBindings) RQSiHardswishMapper = 
NodeMapper(RQSiHardswishParser(), PULPRQSiHardswishTilingReadyBindings) +QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings) +DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) +GEMMDequantMapper = NodeMapper(PULPGEMMParser(), BasicGEMMBindings) +AveragePool2DMapper = NodeMapper(AveragePool2DParser(), PULPAveragePool2DTilingReadyBindings) +# ── Training / Gradient NodeMappers ─────────────────────────────────────── +GELUGradMapper = NodeMapper(GELUGradParser(), PULPFPGELUGradTilingReadyBindings) +ConvGradXMapper = NodeMapper(PULPConvGradX2DParser(), PULPConvGradX2DTilingReadyBindings) +DwConvGradxMapper = NodeMapper(PULPDWConvGradX2DParser(), PULPDWConvGradX2DTilingReadyBindings) +PWConvGradX2DMapper = NodeMapper(PULPPWConvGradX2DParser(), PULPPWConvGradX2DTilingReadyBindings) +ConvGradWMapper = NodeMapper(PULPConvGradW2DParser(), PULPConvGradW2DTilingReadyBindings) +DwConvGradWMapper = NodeMapper(PULPDWConvGradW2DParser(), PULPDWConvGradW2DTilingReadyBindings) +PWConvGradW2DMapper = NodeMapper(PULPPWConvGradW2DParser(), PULPPWConvGradW2DTilingReadyBindings) +LayerNormGradMapper = NodeMapper(LayerNormGradParser(), PULPLayernormGradTilingReadyBindings) +AveragePoolGrad2DMapper = NodeMapper(AveragePool2DParser(), PULPAveragePoolGrad2DTilingReadyBindings) +MaxPoolGrad2DMapper = NodeMapper(MaxPoolGradParser(), PULPMaxPoolGrad2DTilingReadyBindings) +ReluGradMapper = NodeMapper(ReluGradParser(), PULPReluGradTilingReadyBindings) +SoftmaxGradMapper = NodeMapper(SoftmaxGradParser(), PULPSoftmaxGradTilingReadyBindings) SoftmaxCrossEntropyLossMapper = NodeMapper(SoftmaxCrossEntropyLossParser(), PULPSoftmaxCrossEntropyTilingReadyBindings) +# Dual-output mapper (loss + log_prob): tried first; falls back to single-output mapper for 1-output nodes +SoftmaxCrossEntropyLossDualOutputMapper = NodeMapper(SoftmaxCrossEntropyLossParser(), + PULPSoftmaxCrossEntropyLossDualOutputTilingReadyBindings) SoftmaxCrossEntropyLossGradMapper = 
NodeMapper(SoftmaxCrossEntropyLossGradParser(), PULPSoftmaxCrossEntropyGradTilingReadyBindings) SGDMapper = NodeMapper(SGDParser(), PULPSGDTilingReadyBindings) -QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings) -DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) -GEMMDequantMapper = NodeMapper(PULPGEMMParser(), BasicGEMMBindings) +InPlaceAccumulatorV2Mapper = NodeMapper(InPlaceAccumulatorV2Parser(), PULPInPlaceAccumulatorV2TilingReadyBindings) +ConvGradBMapper = NodeMapper(Conv2DGradBParser(), PULPConvGradBTilingReadyBindings) +MSELossMapper = NodeMapper(MSELossParser(), PULPMSELossTilingReadyBindings) +MSELossGradMapper = NodeMapper(MSELossGradParser(), PULPMSELossGradTilingReadyBindings) +BatchNormInternalMapper = NodeMapper(BatchNormInternalParser(), PULPBatchNormInternalTilingReadyBindings) +BatchNormalizationGradMapper = NodeMapper(BatchNormalizationGradParser(), PULPBatchNormalizationGradTilingReadyBindings) +WelfordReduceMapper = NodeMapper(WelfordReduceParser(), PULPWelfordReduceTilingReadyBindings) +ChannelNormalizeMapper = NodeMapper(ChannelNormalizeParser(), PULPChannelNormalizeTilingReadyBindings) +BNGradReduceMapper = NodeMapper(BNGradReduceParser(), PULPBNGradReduceTilingReadyBindings) +BNGradNormalizeMapper = NodeMapper(BNGradNormalizeParser(), PULPBNGradNormalizeTilingReadyBindings) +GlobalAveragePoolMapper = NodeMapper(GlobalAveragePoolParser(), PULPGlobalAveragePool2DTilingReadyBindings) +GlobalAveragePoolGradMapper = NodeMapper(GlobalAveragePoolGradParser(), PULPGlobalAveragePoolGrad2DTilingReadyBindings) + PULPMapping = { + # ── Inference operators ─────────────────────────────────────────────── 'Conv': ConvLayer([FPConv2DMapper, FPDWConv2DMapper]), 'RequantizedConv': PULPRQSConvLayer([Conv2DMapper, DWConv2DMapper, Conv1DMapper, DWConv1DMapper]), 'RequantizedGemm': PULPRQSGEMMLayer([MatrixVecMapper, TallGEMMMapper, GEMMMapper]), 'Gemm': GEMMLayer([FloatGEMMMapper, GEMMDequantMapper]), 'Gelu': GELULayer([GELUMapper]), - 
'GeluGrad': GELUGradLayer([GELUGradMapper]), 'LayerNormalization': LayerNormLayer([LayerNormMapper]), - 'LayerNormalizationGrad': LayerNormGradLayer([LayerNormGradMapper]), - 'MaxPool': MaxPoolLayer([MaxPool1DMapper, MaxPool2DMapper]), + 'MaxPool': MaxPoolLayer([MaxPool2DMapper]), + 'AveragePool': AveragePoolLayer([AveragePool2DMapper]), 'RequantizediGELU': RQSiGELULayer([RQGELU_int8_Mapper]), 'RQIntegerDiv': RQIntegerDivLayer([RQIntegerDivMapper]), 'MatMul': MatMulLayer([MatMulMapper]), @@ -148,10 +198,31 @@ 'RequantizediHardswish': RQSiHardswishLayer([RQSiHardswishMapper]), 'Quant': QuantLayer([QuantMapper]), 'Dequant': QuantLayer([DequantMapper]), + # ── Training / Gradient operators ───────────────────────────────────── + 'ConvGradX': ConvGradXLayer([PWConvGradX2DMapper, DwConvGradxMapper, ConvGradXMapper]), + 'ConvGradW': ConvGradWLayer([PWConvGradW2DMapper, DwConvGradWMapper, ConvGradWMapper]), + 'ConvGradB': ConvGradBLayer([ConvGradBMapper]), + 'GeluGrad': GELUGradLayer([GELUGradMapper]), + 'LayerNormalizationGrad': LayerNormGradLayer([LayerNormGradMapper]), + 'AveragePoolGrad': AveragePoolGradLayer([AveragePoolGrad2DMapper]), + 'MaxPoolGrad': MaxPoolGradLayer([MaxPoolGrad2DMapper]), + 'ReluGrad': ReluGradLayer([ReluGradMapper]), 'SoftmaxGrad': SoftmaxGradLayer([SoftmaxGradMapper]), - 'SoftmaxCrossEntropyLoss': SoftmaxCrossEntropyLossLayer([SoftmaxCrossEntropyLossMapper]), + 'MSELoss': MSELossLayer([MSELossMapper]), + 'MSELossGrad': MSELossGradLayer([MSELossGradMapper]), + 'SoftmaxCrossEntropyLoss': SoftmaxCrossEntropyLossLayer([SoftmaxCrossEntropyLossDualOutputMapper, SoftmaxCrossEntropyLossMapper]), 'SoftmaxCrossEntropyLossGrad': SoftmaxCrossEntropyLossGradLayer([SoftmaxCrossEntropyLossGradMapper]), - 'SGD': SGDLayer([SGDMapper]) + 'SGD': SGDLayer([SGDMapper]), + 'InPlaceAccumulatorV2': InPlaceAccumulatorV2Layer([InPlaceAccumulatorV2Mapper]), + 'BatchNormInternal': BatchNormInternalLayer([BatchNormInternalMapper]), + 'BatchNormalizationGrad': 
BatchNormalizationGradLayer([BatchNormalizationGradMapper]), + 'GlobalAveragePool': GlobalAveragePoolLayer([GlobalAveragePoolMapper]), + 'GlobalAveragePoolGrad': GlobalAveragePoolGradLayer([GlobalAveragePoolGradMapper]), + # Split BN ops (for spatial tiling) + 'WelfordReduce': WelfordReduceLayer([WelfordReduceMapper]), + 'ChannelNormalize': ChannelNormalizeLayer([ChannelNormalizeMapper]), + 'BNGradReduce': BNGradReduceLayer([BNGradReduceMapper]), + 'BNGradNormalize': BNGradNormalizeLayer([BNGradNormalizeMapper]), } @@ -225,6 +296,7 @@ class PULPStructBuffer(StructBuffer): PULPOptimizer = TopologyOptimizer([ + SplitConvGradPass(), QuantPatternPass(), DequantPatternPass(), SkipEmptyConcatPass(), diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatAveragePoolTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatAveragePoolTemplate.py new file mode 100644 index 0000000000..ab396c5321 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatAveragePoolTemplate.py @@ -0,0 +1,67 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +referenceTemplate = NodeTemplate(""" +// 2D Float AveragePool Channel Parallel (Name: ${nodeName}, Op: ${nodeOp}) + +${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for (uint32_t n=0; n<${batch}; ++n) { + PULP_AvgPool2d_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( + ref_${data_out}_${data_in}, + ${dim_im_in_y}, ${dim_im_in_x}, ${ch_im_in}, + ${dim_kernel_y}, ${dim_kernel_x}, + ${stride_y}, ${stride_x}, + ref_${data_out}_${data_out}, + ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right} + ); + ref_${data_out}_${data_in} += ${ch_im_in}*${dim_im_in_x}*${dim_im_in_y}; + ref_${data_out}_${data_out} += ${ch_im_out}*${dim_im_out_x}*${dim_im_out_y}; +} +""") + 
+referenceCHWTemplate = NodeTemplate(""" +// 2D Float AveragePool CHW (Name: ${nodeName}, Op: ${nodeOp}) + +${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for (uint32_t n=0; n<${batch}; ++n) { + PULP_AvgPool2d_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_CHW( + ref_${data_out}_${data_in}, + ${dim_im_in_x}, ${dim_im_in_y}, ${ch_im_in}, + ${dim_kernel_x}, ${dim_kernel_y}, + ${stride_x}, ${stride_y}, + ref_${data_out}_${data_out}, + ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right} + ); + ref_${data_out}_${data_in} += ${ch_im_in}*${dim_im_in_x}*${dim_im_in_y}; + ref_${data_out}_${data_out} += ${ch_im_out}*${dim_im_out_x}*${dim_im_out_y}; +} +""") + +referenceGradTemplate = NodeTemplate(""" +// 2D Float AveragePoolGrad Channel Parallel (Name: ${nodeName}, Op: ${nodeOp}) +${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for (uint32_t n=0; n<${batch}; ++n) { + + PULP_AvgPoolGrad2d_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( + ref_${data_out}_${data_in}, + ${dim_im_in_x}, ${dim_im_in_y}, ${ch_im_in}, + ${dim_im_out_x}, ${dim_im_out_y}, + ${dim_kernel_x}, ${dim_kernel_y}, + ${stride_x}, ${stride_y}, + ref_${data_out}_${data_out}, + ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right} + ); + + ref_${data_out}_${data_in} += ${ch_im_in}*${dim_im_in_x}*${dim_im_in_y}; + ref_${data_out}_${data_out} += ${ch_im_out}*${dim_im_out_x}*${dim_im_out_y}; +} +""") \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatBatchNormTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatBatchNormTemplate.py new file mode 100644 index 0000000000..0656bc85e3 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatBatchNormTemplate.py @@ -0,0 +1,119 
@@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +# Forward pass (training mode): BatchNormInternal +# Inputs: X, gamma, beta, running_mean, running_var +# Outputs: Y, saved_mean, saved_inv_std (updated_running_mean/var have no consumers) +batchNormInternalTemplate = NodeTemplate(""" +// BatchNormInternal (Name: ${nodeName}, Op: ${nodeOp}) +PULP_BatchNormInternal_fp32( + ${data_in}, + ${scale}, + ${bias}, + ${running_mean}, + ${running_var}, + ${data_out}, + ${saved_mean}, + ${saved_inv_std}, + ${N}, + ${C}, + ${H_in}, + ${W_in}, + ${epsilon}f, + ${momentum}f +); +""") + +# Backward pass: BatchNormalizationGrad +# Inputs: dY, X, gamma, saved_mean, saved_inv_std +# Outputs: dX, dgamma, dbeta +batchNormGradTemplate = NodeTemplate(""" +// BatchNormalizationGrad (Name: ${nodeName}, Op: ${nodeOp}) +PULP_BatchNormGrad_fp32( + ${dY}, + ${X}, + ${gamma}, + ${saved_mean}, + ${saved_inv_std}, + ${dX}, + ${dgamma}, + ${dbeta}, + ${N}, + ${C}, + ${H_in}, + ${W_in}, + ${epsilon}f +); +""") + +# Split BN forward: WelfordReduce +welfordReduceTemplate = NodeTemplate(""" +// WelfordReduce (Name: ${nodeName}, Op: ${nodeOp}) +PULP_WelfordReduce_fp32( + ${data_in}, + ${saved_mean}, + ${saved_inv_std}, + ${N}, + ${C}, + ${H_in}, + ${W_in}, + ${epsilon}f +); +""") + +# Split BN forward: ChannelNormalize +channelNormalizeTemplate = NodeTemplate(""" +// ChannelNormalize (Name: ${nodeName}, Op: ${nodeOp}) +PULP_ChannelNormalize_fp32( + ${data_in}, + ${saved_mean}, + ${saved_inv_std}, + ${gamma}, + ${beta}, + ${data_out}, + ${N}, + ${C}, + ${H_in}, + ${W_in} +); +""") + +# Split BN backward: BNGradReduce +bnGradReduceTemplate = NodeTemplate(""" +// BNGradReduce (Name: ${nodeName}, Op: ${nodeOp}) +PULP_BNGradReduce_fp32( + ${dY}, + ${X}, + ${saved_mean}, + ${saved_inv_std}, + ${dgamma}, + ${dbeta}, + ${N}, + ${C}, + ${H_in}, + ${W_in} +); +""") + +# Split BN backward: 
BNGradNormalize +bnGradNormalizeTemplate = NodeTemplate(""" +// BNGradNormalize (Name: ${nodeName}, Op: ${nodeOp}) +PULP_BNGradNormalize_fp32( + ${dY}, + ${X}, + ${saved_mean}, + ${saved_inv_std}, + ${gamma}, + ${dgamma}, + ${dbeta}, + ${dX}, + ${N}, + ${C}, + ${H_in}, + ${W_in}, + ${N_total_inv}f +); +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatConvGradTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatConvGradTemplate.py new file mode 100644 index 0000000000..061f4a159e --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatConvGradTemplate.py @@ -0,0 +1,405 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + +_TILE_IDX_NULL = "NULL" + + +class _ConvGradWTemplate(NodeTemplate): + """NodeTemplate subclass for ConvGradW operators. + + Injects tileIdxPtr='NULL' sentinel via alignToContext so the template + always has a defined tileIdxPtr value, avoiding Mako strict_undefined + eager-initialization NameError. The tiling pass overwrites 'NULL' with + the real buffer name when multi-tile execution is required. 
+ """ + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + if 'tileIdxPtr' not in operatorRepresentation: + operatorRepresentation['tileIdxPtr'] = _TILE_IDX_NULL + return ctxt, operatorRepresentation, [] + + +class PULP2DFloatConvGradWIm2ColTemplate(_ConvGradWTemplate): + + @staticmethod + def computeTransientBuffersSize( + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: + # For ConvGradW, im2col buffer stores im2row transformed input + # Size: H_out * W_out * kernel_h * kernel_w * C_in * sizeof(float) + im2col_dim = (operatorRepresentation["data_in_type"].typeWidth // 8) * \ + operatorRepresentation['dim_im_out_x'] * operatorRepresentation['dim_im_out_y'] * \ + operatorRepresentation['ch_im_in'] * \ + operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] + + im2col_name = operatorRepresentation['nodeName'] + "_buffer" + + return [(im2col_name, im2col_dim)] + + def hoistTransientBuffers(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + im2col_name, im2col_dim = PULP2DFloatConvGradWIm2ColTemplate.computeTransientBuffersSize( + ctxt, operatorRepresentation)[0] + ctxt.hoistTransientBuffer(im2col_name, im2col_dim) + + operatorRepresentation['ctxtBuffer'] = im2col_name + operatorRepresentation['ctxtBufferSize'] = im2col_dim + return ctxt, operatorRepresentation, [im2col_name] + + +class PULP2DFloatConvGradXIm2ColTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + @staticmethod + def computeTransientBuffersSize( + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: + im2col_dim = (operatorRepresentation["grad_in_type"].typeWidth // 8) * \ + operatorRepresentation['dim_im_out_x'] * 
operatorRepresentation['dim_im_out_y'] * \ + operatorRepresentation['ch_im_out'] * \ + operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] + + im2col_name = operatorRepresentation['nodeName'] + "_im2col_buffer" + + bt_dim = (operatorRepresentation["weight_type"].typeWidth // 8) * \ + operatorRepresentation['ch_im_in'] * operatorRepresentation['ch_im_out'] * \ + operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] + + bt_name = operatorRepresentation['nodeName'] + "_bt_buffer" + + return [(im2col_name, im2col_dim), (bt_name, bt_dim)] + + def hoistTransientBuffers(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + buffers = PULP2DFloatConvGradXIm2ColTemplate.computeTransientBuffersSize(ctxt, operatorRepresentation) + + im2col_name, im2col_dim = buffers[0] + bt_name, bt_dim = buffers[1] + + ctxt.hoistTransientBuffer(im2col_name, im2col_dim) + ctxt.hoistTransientBuffer(bt_name, bt_dim) + + operatorRepresentation['ctxtBuffer'] = im2col_name + operatorRepresentation['ctxtBufferSize'] = im2col_dim + operatorRepresentation['btBuffer'] = bt_name + operatorRepresentation['btBufferSize'] = bt_dim + + return ctxt, operatorRepresentation, [im2col_name, bt_name] + + +# Templates for ConvGradX operations +referenceConvGradX2DTemplate = NodeTemplate(""" +// 2D FP ConvGradX (dX) NCHW trainlib naive (Name: ${nodeName}, Op: ${nodeOp}) +${grad_out_type.typeName} ref_${grad_out} = ${grad_out}; // dY +${weight_type.typeName} ref_${weight} = ${weight}; // W +${grad_in_type.typeName} ref_${grad_in} = ${grad_in}; // dX + +for (uint32_t n=0; n<${batch}; ++n) { + PULP_ConvGradX2d_fp${grad_out_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${grad_in_type.referencedType.typeWidth}_CHW_tiled( + ref_${grad_out}, + ${dim_im_out_x}, ${dim_im_out_y}, ${ch_im_out}, + ref_${weight}, + ${ch_im_in}, + ${dim_kernel_x}, ${dim_kernel_y}, + ${stride_x}, 
${stride_y}, + ref_${grad_in}, + ${dim_im_in_x}, ${dim_im_in_y}, + ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right}, + ${offset_grad_in_h}, ${offset_grad_in_w}, + ${offset_grad_out_h}, ${offset_grad_out_w} + + ); + + ref_${grad_out} += ${ch_im_out} * ${dim_im_out_y} * ${dim_im_out_x}; + ref_${grad_in} += ${ch_im_in} * ${dim_im_in_y} * ${dim_im_in_x}; +} +""") + + +referenceConvGradX2DIm2ColTiledTemplate = PULP2DFloatConvGradXIm2ColTemplate(""" +// 2D FP ConvGradX (dX) NCHW/CHW using tile-aware Im2Col (Name: ${nodeName}, Op: ${nodeOp}) +${grad_out_type.typeName} ref_${grad_out} = ${grad_out}; // dY +${weight_type.typeName} ref_${weight} = ${weight}; // W +${grad_in_type.typeName} ref_${grad_in} = ${grad_in}; // dX +for (uint32_t n=0; n<${batch}; ++n) { + PULP_ConvGradX2d_fp${grad_out_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${grad_in_type.referencedType.typeWidth}_CHW_Im2Col_tiled( + ref_${grad_out}, + ${dim_im_out_x}, ${dim_im_out_y}, ${ch_im_out}, // dY tile dims + ref_${weight}, + ${ch_im_in}, + ${dim_kernel_x}, ${dim_kernel_y}, + ${stride_x}, ${stride_y}, + ref_${grad_in}, + ${dim_im_in_x}, ${dim_im_in_y}, // dX tile dims + ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right}, + ${offset_grad_in_h}, ${offset_grad_in_w}, + ${offset_grad_out_h}, ${offset_grad_out_w}, + ${ctxtBuffer}, ${ctxtBufferSize}, + ${btBuffer}, ${btBufferSize} + ); + + ref_${grad_out} += ${ch_im_out} * ${dim_im_out_y} * ${dim_im_out_x}; + ref_${grad_in} += ${ch_im_in} * ${dim_im_in_y} * ${dim_im_in_x}; +} +""" +) + + +# Templates for ConvGradW operations +referenceConvGradW2DTemplate = _ConvGradWTemplate(""" +// 2D FP ConvGradW NCHW using pulp-trainlib naive (Name: ${nodeName}, Op: ${nodeOp}) +${grad_out_type.typeName} ref_${grad_weight}_${grad_out} = ${grad_out}; +${data_in_type.typeName} ref_${grad_weight}_${data_in} = ${data_in}; +${grad_weight_type.typeName} ref_${grad_weight}_out = ${grad_weight}; + +% 
if tileIdxPtr != 'NULL': +{ + static uint32_t ${nodeName}_last_step = 0xFFFFFFFFu; + if ((uint32_t)*${tileIdxPtr} != ${nodeName}_last_step) { + memset(${grad_weight}, 0, (${ch_im_out} * ${ch_im_in} * ${dim_kernel_x} * ${dim_kernel_y}) * sizeof(${grad_weight_type.referencedType.typeName})); + ${nodeName}_last_step = (uint32_t)*${tileIdxPtr}; + } +} +% else: +memset(${grad_weight}, 0, (${ch_im_out} * ${ch_im_in} * ${dim_kernel_x} * ${dim_kernel_y}) * sizeof(${grad_weight_type.referencedType.typeName})); +% endif + +for (uint32_t n=0; n<${batch}; ++n) { + PULP_ConvGradW2d_fp${grad_out_type.referencedType.typeWidth}_fp${data_in_type.referencedType.typeWidth}_fp${grad_weight_type.referencedType.typeWidth}_CHW( + ref_${grad_weight}_${grad_out}, + ${dim_im_out_x}, ${dim_im_out_y}, ${ch_im_out}, + ref_${grad_weight}_${data_in}, + ${dim_im_in_x}, ${dim_im_in_y}, ${ch_im_in}, + ${dim_kernel_x}, ${dim_kernel_y}, + ${stride_x}, ${stride_y}, + ref_${grad_weight}_out, + ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right} + ); + + ref_${grad_weight}_${grad_out} += ${ch_im_out} * ${dim_im_out_y} * ${dim_im_out_x}; + ref_${grad_weight}_${data_in} += ${ch_im_in} * ${dim_im_in_y} * ${dim_im_in_x}; +} +""") + +referenceConvGradW2DIm2ColTemplate = PULP2DFloatConvGradWIm2ColTemplate(""" +// 2D FP ConvGradW NCHW using pulp-trainlib Im2Col (Name: ${nodeName}, Op: ${nodeOp}) +${grad_out_type.typeName} ref_${grad_weight}_${grad_out} = ${grad_out}; +${data_in_type.typeName} ref_${grad_weight}_${data_in} = ${data_in}; +${grad_weight_type.typeName} ref_${grad_weight}_out = ${grad_weight}; + +% if tileIdxPtr != 'NULL': +{ + static uint32_t ${nodeName}_last_step = 0xFFFFFFFFu; + if ((uint32_t)*${tileIdxPtr} != ${nodeName}_last_step) { + memset(${grad_weight}, 0, (${ch_im_out} * ${ch_im_in} * ${dim_kernel_x} * ${dim_kernel_y}) * sizeof(${grad_weight_type.referencedType.typeName})); + ${nodeName}_last_step = (uint32_t)*${tileIdxPtr}; + } +} +% else: +memset(${grad_weight}, 
0, (${ch_im_out} * ${ch_im_in} * ${dim_kernel_x} * ${dim_kernel_y}) * sizeof(${grad_weight_type.referencedType.typeName})); +% endif + +for (uint32_t n=0; n<${batch}; ++n) { + PULP_ConvGradW2d_fp${grad_out_type.referencedType.typeWidth}_fp${data_in_type.referencedType.typeWidth}_fp${grad_weight_type.referencedType.typeWidth}_CHW_Im2Col( + ref_${grad_weight}_${grad_out}, + ${dim_im_out_x}, ${dim_im_out_y}, ${ch_im_out}, + ref_${grad_weight}_${data_in}, + ${dim_im_in_x}, ${dim_im_in_y}, ${ch_im_in}, + ${dim_kernel_x}, ${dim_kernel_y}, + ${stride_x}, ${stride_y}, + ref_${grad_weight}_out, + ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right}, + ${ctxtBuffer}, ${ctxtBufferSize} + ); + + ref_${grad_weight}_${grad_out} += ${ch_im_out} * ${dim_im_out_y} * ${dim_im_out_x}; + ref_${grad_weight}_${data_in} += ${ch_im_in} * ${dim_im_in_y} * ${dim_im_in_x}; +} +""") + +# ============================================================================ +# Depthwise Convolution Gradient Templates +# ============================================================================ + + +referenceDWConvGradW2DTemplate = _ConvGradWTemplate(""" +// 2D FP DW ConvGradW NCHW (Name: ${nodeName}, Op: ${nodeOp}) +${grad_out_type.typeName} ref_${grad_weight}_${grad_out} = ${grad_out}; +${data_in_type.typeName} ref_${grad_weight}_${data_in} = ${data_in}; +${grad_weight_type.typeName} ref_${grad_weight}_out = ${grad_weight}; + +% if tileIdxPtr != 'NULL': +{ + static uint32_t ${nodeName}_last_step = 0xFFFFFFFFu; + if ((uint32_t)*${tileIdxPtr} != ${nodeName}_last_step) { + memset(${grad_weight}, 0, ${ch_im_out} * ${dim_kernel_x} * ${dim_kernel_y} * sizeof(${grad_weight_type.referencedType.typeName})); + ${nodeName}_last_step = (uint32_t)*${tileIdxPtr}; + } +} +% else: +memset(${grad_weight}, 0, ${ch_im_out} * ${dim_kernel_x} * ${dim_kernel_y} * sizeof(${grad_weight_type.referencedType.typeName})); +% endif + +for (uint32_t n=0; n<${batch}; ++n) { + 
PULP_DWConvGradW2d_fp${grad_out_type.referencedType.typeWidth}_fp${data_in_type.referencedType.typeWidth}_fp${grad_weight_type.referencedType.typeWidth}_CHW( + ref_${grad_weight}_${grad_out}, + ${dim_im_out_x}, ${dim_im_out_y}, ${ch_im_out}, + ref_${grad_weight}_${data_in}, + ${dim_im_in_x}, ${dim_im_in_y}, ${ch_im_in}, + ${dim_kernel_x}, ${dim_kernel_y}, + ${stride_x}, ${stride_y}, + ref_${grad_weight}_out, + ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right} + ); + + ref_${grad_weight}_${grad_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; + ref_${grad_weight}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y}; +} + +""") + + +referenceDWConvGradX2DTiledTemplate = NodeTemplate(""" +// 2D FP DW ConvGradX (dX) CHW tiled (Name: ${nodeName}, Op: ${nodeOp}) +${grad_out_type.typeName} ref_${grad_out} = ${grad_out}; // dY +${weight_type.typeName} ref_${weight} = ${weight}; // W +${grad_in_type.typeName} ref_${grad_in}_out = ${grad_in}; // dX + +for (uint32_t n=0; n<${batch}; ++n) { + PULP_DWConvGradX2d_fp${grad_out_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${grad_in_type.referencedType.typeWidth}_CHW_tiled( + ref_${grad_out}, + ${dim_im_out_x}, ${dim_im_out_y}, ${ch_im_out}, + ref_${weight}, + ${ch_im_in}, + ${dim_kernel_x}, ${dim_kernel_y}, + ${stride_x}, ${stride_y}, + ref_${grad_in}_out, + ${dim_im_in_x}, ${dim_im_in_y}, + ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right}, + ${offset_grad_in_h}, ${offset_grad_in_w}, + ${offset_grad_out_h}, ${offset_grad_out_w} + ); + + ref_${grad_out} += ${ch_im_out} * ${dim_im_out_y} * ${dim_im_out_x}; + ref_${grad_in}_out += ${ch_im_in} * ${dim_im_in_y} * ${dim_im_in_x}; +} +""") + +# ============================================================================ +# Pointwise Convolution Gradient Templates +# ============================================================================ + +class 
PULP2DFloatPWConvGradXTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + @staticmethod + def computeTransientBuffersSize( + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: + # Transpose buffer for weight matrix transpose (C_out x C_in) + # For pointwise convolution, kernel size is 1x1 + bt_dim = (operatorRepresentation["weight_type"].typeWidth // 8) * \ + operatorRepresentation['ch_im_in'] * operatorRepresentation['ch_im_out'] + + bt_name = operatorRepresentation['nodeName'] + "_transpose_buffer" + + return [(bt_name, bt_dim)] + + def hoistTransientBuffers(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + bt_name, bt_dim = PULP2DFloatPWConvGradXTemplate.computeTransientBuffersSize( + ctxt, operatorRepresentation)[0] + + ctxt.hoistTransientBuffer(bt_name, bt_dim) + + operatorRepresentation['transposeBuffer'] = bt_name + operatorRepresentation['transposeBufferSize'] = bt_dim + + return ctxt, operatorRepresentation, [bt_name] + + +referencePWConvGradW2DTemplate = _ConvGradWTemplate(""" +// 2D FP Pointwise ConvGradW (1x1) NCHW using pulp-trainlib pw interface (Name: ${nodeName}, Op: ${nodeOp}) +${grad_out_type.typeName} ref_${grad_weight}_${grad_out} = ${grad_out}; +${data_in_type.typeName} ref_${grad_weight}_${data_in} = ${data_in}; +${grad_weight_type.typeName} ref_${grad_weight}_out = ${grad_weight}; + +% if tileIdxPtr != 'NULL': +{ + static uint32_t ${nodeName}_last_step = 0xFFFFFFFFu; + if ((uint32_t)*${tileIdxPtr} != ${nodeName}_last_step) { + memset(${grad_weight}, 0, ${ch_im_out} * ${ch_im_in} * sizeof(${grad_weight_type.referencedType.typeName})); + ${nodeName}_last_step = (uint32_t)*${tileIdxPtr}; + } +} +% else: +memset(${grad_weight}, 0, ${ch_im_out} * ${ch_im_in} * sizeof(${grad_weight_type.referencedType.typeName})); +% endif + +for (uint32_t n=0; n<${batch}; ++n) { + 
PULP_PWConvGradW2d_fp${grad_out_type.referencedType.typeWidth}_fp${data_in_type.referencedType.typeWidth}_fp${grad_weight_type.referencedType.typeWidth}_CHW(
+        ref_${grad_weight}_${grad_out},
+        ${dim_im_out_x}, ${dim_im_out_y}, ${ch_im_out},
+        ref_${grad_weight}_${data_in},
+        ${dim_im_in_x}, ${dim_im_in_y}, ${ch_im_in},
+        ref_${grad_weight}_out
+    );
+
+    ref_${grad_weight}_${grad_out} += ${ch_im_out} * ${dim_im_out_y} * ${dim_im_out_x};
+    ref_${grad_weight}_${data_in} += ${ch_im_in} * ${dim_im_in_y} * ${dim_im_in_x};
+}
+
+""")
+
+referencePWConvGradX2DTemplate = PULP2DFloatPWConvGradXTemplate("""
+// 2D FP Pointwise ConvGradX (1x1) CHW using pulp-trainlib pw interface (Name: ${nodeName}, Op: ${nodeOp})
+${grad_out_type.typeName} ref_${grad_in}_${grad_out} = ${grad_out};   // dY
+${weight_type.typeName} ref_${grad_in}_${weight} = ${weight};   // W
+${grad_in_type.typeName} ref_${grad_in}_out = ${grad_in};   // dX
+
+for (uint32_t n=0; n<${batch}; ++n) {
+    PULP_PWConvGradX2d_fp${grad_out_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${grad_in_type.referencedType.typeWidth}_CHW(
+        ref_${grad_in}_${grad_out},
+        ${dim_im_out_x}, ${dim_im_out_y}, ${ch_im_out},
+        ref_${grad_in}_${weight},
+        ${ch_im_in},
+        ref_${grad_in}_out,
+        ${dim_im_in_x}, ${dim_im_in_y},
+        ${transposeBuffer}, ${transposeBufferSize}
+    );
+
+    ref_${grad_in}_${grad_out} += ${ch_im_out} * ${dim_im_out_y} * ${dim_im_out_x};
+    ref_${grad_in}_out += ${ch_im_in} * ${dim_im_in_y} * ${dim_im_in_x};
+}
+
+""")
+
+
+# Template for ConvGradB: dB[c] = sum_{n,h,w} dY[n,c,h,w]
+referenceConvGradB2DTemplate = NodeTemplate("""
+// 2D FP ConvGradB: bias gradient = sum dY over N,H,W (Name: ${nodeName}, Op: ${nodeOp})
+${grad_out_type.typeName} ref_dB_dy = ${grad_out};
+${grad_bias_type.typeName} ref_dB_db = ${grad_bias};
+for (uint32_t c = 0; c < ${ch_im_out}; ++c) {
+    ref_dB_db[c] = 0.0f;
+    for (uint32_t n = 0; n < ${batch}; ++n) {
+        for (uint32_t h = 0; h < ${dim_im_out_y}; ++h) {
+            for (uint32_t
w = 0; w < ${dim_im_out_x}; ++w) { + ref_dB_db[c] += ref_dB_dy[n * ${ch_im_out} * ${dim_im_out_y} * ${dim_im_out_x} + c * ${dim_im_out_y} * ${dim_im_out_x} + h * ${dim_im_out_x} + w]; + } + } + } +} +""") \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py index bfa893db94..29ebdb6b46 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py @@ -158,3 +158,4 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; } """) + diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py index 59499706e5..ef046f191d 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py @@ -4,7 +4,8 @@ from typing import Dict, List, Tuple -from Deeploy.AbstractDataTypes import float32_tPtr +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation @@ -19,7 +20,7 @@ def alignToContext(self, ctxt: NetworkContext, if 'C' not in operatorRepresentation or operatorRepresentation['C'] is None: # No bias case - set C to NULL and provide a default type operatorRepresentation['C'] = None - operatorRepresentation['C_type'] = float32_tPtr # Default to fp32 type + operatorRepresentation['C_type'] = PointerClass(float32_t) # Default to fp32 type operatorRepresentation['C_batched'] = False return ctxt, operatorRepresentation, [] diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGlobalAveragePoolTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatGlobalAveragePoolTemplate.py new file mode 100644 index 0000000000..fe0503c834 --- /dev/null +++ 
b/Deeploy/Targets/PULPOpen/Templates/FloatGlobalAveragePoolTemplate.py @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +# Forward: GlobalAveragePool +# Inputs: data_in [N, C, H, W] +# Outputs: data_out [N, C, 1, 1] (stored as N*C elements) +globalAveragePoolTemplate = NodeTemplate(""" +// GlobalAveragePool (Name: ${nodeName}, Op: ${nodeOp}) +PULP_GlobalAveragePool_fp32( + ${data_in}, + ${data_out}, + ${batch}, + ${channels}, + ${dim_im_in_x}, + ${dim_im_in_y} +); +""") + +# Backward: GlobalAveragePoolGrad +# Inputs: dY [N, C, 1, 1] (stored as N*C elements) +# Outputs: dX [N, C, H, W] +globalAveragePoolGradTemplate = NodeTemplate(""" +// GlobalAveragePoolGrad (Name: ${nodeName}, Op: ${nodeOp}) +PULP_GlobalAveragePoolGrad_fp32( + ${dY}, + ${dX}, + ${batch}, + ${channels}, + ${H}, + ${W} +); +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatInPlaceAccumulatorV2Template.py b/Deeploy/Targets/PULPOpen/Templates/FloatInPlaceAccumulatorV2Template.py new file mode 100644 index 0000000000..2c01219dbd --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatInPlaceAccumulatorV2Template.py @@ -0,0 +1,89 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer + + +class _PULPInPlaceAccumulatorV2Template(NodeTemplate): + """True in-place InPlaceAccumulatorV2 template for PULP. + + Writes the result directly into accum_buffer (the graph input) rather + than into a separate data_out buffer. data_out is registered as an + alias of accum_buffer so the memory allocator knows they share memory + and will not free accum_buffer prematurely. 
+ + Semantics: + if lazy_reset_grad: accum_buffer = gradient (reset) + else: accum_buffer += gradient (accumulate) + """ + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext( + self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, OperatorRepresentation, List[str]]: + accum_buffer = ctxt.lookup(operatorRepresentation['accum_buffer']) + data_out = ctxt.lookup(operatorRepresentation['data_out']) + + accum_buffer.aliases.add(data_out.name) + data_out.aliases.add(accum_buffer.name) + data_out._alias = accum_buffer.name + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _PULPInPlaceAccumulatorV2Template(""" +// InPlaceAccumulatorV2 - true in-place (Name: ${nodeName}, Op: ${nodeOp}) +// Writes result to accum_buffer (in-place) and data_out (explicit output). +// In training, data_out aliases accum_buffer (same or separate allocation). +// Reset (lazy_reset_grad=1): accum_buffer = gradient +// Accum (lazy_reset_grad=0): accum_buffer += gradient +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_log2Core = log2(NUM_CORES); +int32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +int32_t ${nodeName}_start = MIN(${nodeName}_chunk * ${nodeName}_core_id, (int32_t)${size}); +int32_t ${nodeName}_stop = MIN(${nodeName}_start + ${nodeName}_chunk, (int32_t)${size}); + +if (${lazy_reset_grad}[0]) { + for (int32_t i = ${nodeName}_start; i < ${nodeName}_stop; i++) { + ${accum_buffer}[i] = ${gradient}[i]; + ${data_out}[i] = ${gradient}[i]; + } +} else { + for (int32_t i = ${nodeName}_start; i < ${nodeName}_stop; i++) { + ${accum_buffer}[i] += ${gradient}[i]; + ${data_out}[i] = ${accum_buffer}[i]; + } +} +""") + +# Tiled variant: writes only to ${accum_buffer} (no ${data_out} write). +# In the tiled context the optimizer reads the gradient directly from +# accum_buffer's L2 address (input_4/input_5). 
data_out's L2 address may +# overlap with other live buffers, so writing to it via DMA would corrupt L2. +# Omitting ${data_out} means we do not need a DMA egress for it at all. +tiledReferenceTemplate = _PULPInPlaceAccumulatorV2Template(""" +// InPlaceAccumulatorV2 - tiled in-place (Name: ${nodeName}, Op: ${nodeOp}) +// Tiled variant: result written only to accum_buffer (egressed to L2 by DMA). +// data_out is NOT written here — optimizer reads gradient from accum_buffer. +// Reset (lazy_reset_grad=1): accum_buffer = gradient +// Accum (lazy_reset_grad=0): accum_buffer += gradient +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_log2Core = log2(NUM_CORES); +int32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +int32_t ${nodeName}_start = MIN(${nodeName}_chunk * ${nodeName}_core_id, (int32_t)${size}); +int32_t ${nodeName}_stop = MIN(${nodeName}_start + ${nodeName}_chunk, (int32_t)${size}); + +if (${lazy_reset_grad}[0]) { + for (int32_t i = ${nodeName}_start; i < ${nodeName}_stop; i++) { + ${accum_buffer}[i] = ${gradient}[i]; + } +} else { + for (int32_t i = ${nodeName}_start; i < ${nodeName}_stop; i++) { + ${accum_buffer}[i] += ${gradient}[i]; + } +} +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatLayernormTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatLayernormTemplate.py index 88aac71220..7e379b5265 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatLayernormTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatLayernormTemplate.py @@ -6,11 +6,14 @@ referenceTemplate = NodeTemplate(""" // Float Layernorm (Name: ${nodeName}, Op: ${nodeOp}) +// Outputs: Y (data_out), mean stash, inv_std_dev stash PULP_Layernorm_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}( ${data_in}, ${data_out}, ${weight}, ${bias}, + ${mean}, + ${inv_std_dev}, ${size}, ${lastDimLength}, ${epsilon} @@ -19,6 +22,7 @@ referenceGradTemplate = NodeTemplate(""" // 
FloatLayernormGrad Parallel (Name: ${nodeName}, Op: ${nodeOp}) +// Uses pre-computed mean/inv_std_dev stash from forward pass int8_t ${nodeName}_core_id = pi_core_id(); int8_t ${nodeName}_log2Core = log2(NUM_CORES); @@ -36,17 +40,43 @@ const float${grad_in_type.referencedType.typeWidth}_t* ${nodeName}_grad_in_ptr = ${grad_in} + ${nodeName}_elem_start; const float${data_in_type.referencedType.typeWidth}_t* ${nodeName}_data_in_ptr = ${data_in} + ${nodeName}_elem_start; float${grad_out_type.referencedType.typeWidth}_t* ${nodeName}_grad_out_ptr = ${grad_out} + ${nodeName}_elem_start; +const float${mean_type.referencedType.typeWidth}_t* ${nodeName}_mean_ptr = ${mean} + ${nodeName}_start; +const float${inv_std_dev_type.referencedType.typeWidth}_t* ${nodeName}_inv_std_dev_ptr = ${inv_std_dev} + ${nodeName}_start; +// Zero-initialize weight_grad/bias_grad on first tile (accumulation across seq tiles) +static uint8_t ${nodeName}_param_initialized = 0; +if (!${nodeName}_param_initialized) { + memset(${weight_grad}, 0, ${lastDimLength} * sizeof(float${grad_out_type.referencedType.typeWidth}_t)); + memset(${bias_grad}, 0, ${lastDimLength} * sizeof(float${grad_out_type.referencedType.typeWidth}_t)); + ${nodeName}_param_initialized = 1; +} + +// Parallel: compute dX for each core's chunk of sequences using stash if (${nodeName}_elem_count > 0) { - LayernormGrad_fp${grad_in_type.referencedType.typeWidth}_fp${grad_out_type.referencedType.typeWidth}( - ${nodeName}_grad_in_ptr, // Upstream gradient (dy) - ${nodeName}_data_in_ptr, // Original input (x) - ${nodeName}_grad_out_ptr, // Output gradient (dx) - ${weight}, // Input Scale parameter - ${bias}, // Input Bias parameter - ${epsilon}, // Epsilon for numerical stability - ${nodeName}_elem_count, // Number of elements to process - ${lastDimLength} // Size of the feature dimension + PULP_LayernormGrad_fp${grad_in_type.referencedType.typeWidth}_fp${grad_out_type.referencedType.typeWidth}( + ${nodeName}_grad_in_ptr, // Upstream 
gradient (dY) - chunk + ${nodeName}_data_in_ptr, // Original input (X) - chunk + ${nodeName}_mean_ptr, // Stash mean - chunk + ${nodeName}_inv_std_dev_ptr, // Stash inv_std_dev - chunk + ${nodeName}_grad_out_ptr, // Output gradient (dX) - chunk + ${weight}, // Scale parameter (gamma) + ${nodeName}_elem_count, // Number of elements to process + ${lastDimLength} // Size of the feature dimension ); } -""") \ No newline at end of file + +// Core 0 only: accumulate dscale (weight_grad) and dbias (bias_grad) for this seq tile +if (${nodeName}_core_id == 0) { + for (uint32_t ${nodeName}_s = 0; ${nodeName}_s < (uint32_t)${nodeName}_seq_length; ${nodeName}_s++) { + const float${grad_in_type.referencedType.typeWidth}_t* ${nodeName}_dy_s = ${grad_in} + ${nodeName}_s * ${lastDimLength}; + const float${data_in_type.referencedType.typeWidth}_t* ${nodeName}_x_s = ${data_in} + ${nodeName}_s * ${lastDimLength}; + float${mean_type.referencedType.typeWidth}_t ${nodeName}_mu = ${mean}[${nodeName}_s]; + float${inv_std_dev_type.referencedType.typeWidth}_t ${nodeName}_isd = ${inv_std_dev}[${nodeName}_s]; + for (uint32_t ${nodeName}_c = 0; ${nodeName}_c < (uint32_t)${lastDimLength}; ${nodeName}_c++) { + float${data_in_type.referencedType.typeWidth}_t ${nodeName}_xhat = (${nodeName}_x_s[${nodeName}_c] - ${nodeName}_mu) * ${nodeName}_isd; + ${weight_grad}[${nodeName}_c] += ${nodeName}_dy_s[${nodeName}_c] * ${nodeName}_xhat; + ${bias_grad}[${nodeName}_c] += ${nodeName}_dy_s[${nodeName}_c]; + } + } +} +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMaxPoolTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatMaxPoolTemplate.py index 846aeae92d..3134d6c6be 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatMaxPoolTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatMaxPoolTemplate.py @@ -22,4 +22,29 @@ ref_${data_out}_${data_in} += ${ch_im_in}*${dim_im_in_x}*${dim_im_in_y}; ref_${data_out}_${data_out} += ${ch_im_out}*${dim_im_out_x}*${dim_im_out_y}; } +""") + 
+referenceGradTemplate = NodeTemplate(""" +// 2D Float MaxPoolGrad Channel Parallel (Name: ${nodeName}, Op: ${nodeOp}) +${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; +${x_in_type.typeName} ref_${data_out}_${x_in} = ${x_in}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for (uint32_t n=0; n<${batch}; ++n) { + + PULP_MaxPoolGrad2d_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( + ref_${data_out}_${data_in}, + ref_${data_out}_${x_in}, + ${dim_im_in_x}, ${dim_im_in_y}, ${ch_im_in}, + ${dim_im_out_x}, ${dim_im_out_y}, + ${dim_kernel_x}, ${dim_kernel_y}, + ${stride_x}, ${stride_y}, + ref_${data_out}_${data_out}, + ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right} + ); + + ref_${data_out}_${data_in} += ${ch_im_in}*${dim_im_in_x}*${dim_im_in_y}; + ref_${data_out}_${x_in} += ${ch_im_out}*${dim_im_out_x}*${dim_im_out_y}; + ref_${data_out}_${data_out} += ${ch_im_out}*${dim_im_out_x}*${dim_im_out_y}; +} """) \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatReluTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatReluTemplate.py index ab22b75bee..de9a4e47fc 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatReluTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatReluTemplate.py @@ -11,4 +11,15 @@ ${data_out}, ${size} ); +""") + +referenceGradTemplate = NodeTemplate(""" +// ReLU Grad (Name: ${nodeName}, Op: ${nodeOp}) + +PULP_ReluGrad_fp${grad_in_type.referencedType.typeWidth}_fp${grad_out_type.referencedType.typeWidth}( + ${grad_out}, + ${data_in}, + ${grad_in}, + ${size} +); """) \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/MSELossTemplate.py b/Deeploy/Targets/PULPOpen/Templates/MSELossTemplate.py new file mode 100644 index 0000000000..4d70b92b1c --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/MSELossTemplate.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and 
University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +referenceTemplate = NodeTemplate(""" +BEGIN_SINGLE_CORE + // MSELoss (Name: ${nodeName}, Op: ${nodeOp}) + // loss = mean((pred - target)^2) + float32_t mse_sum = 0.0f; + for (uint32_t i = 0; i < ${num_elements}; i++) { + float32_t mse_d = ${pred}[i] - ${target}[i]; + mse_sum += mse_d * mse_d; + } + ${loss}[0] = mse_sum / (float32_t)${num_elements}; + printf(" [MSE] loss=%.6f\\r\\n", (double)${loss}[0]); +END_SINGLE_CORE +""") + +referenceGradientTemplate = NodeTemplate(""" +BEGIN_SINGLE_CORE + // MSELossGrad (Name: ${nodeName}, Op: ${nodeOp}) + // grad = 2 * (pred - target) / N + float32_t mse_grad_scale = 2.0f / (float32_t)${num_elements}; + for (uint32_t i = 0; i < ${num_elements}; i++) { + ${grad}[i] = mse_grad_scale * (${pred}[i] - ${target}[i]); + } +END_SINGLE_CORE +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/SGDTemplate.py b/Deeploy/Targets/PULPOpen/Templates/SGDTemplate.py index 1592fe30c4..da27aab47c 100644 --- a/Deeploy/Targets/PULPOpen/Templates/SGDTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/SGDTemplate.py @@ -2,9 +2,42 @@ # # SPDX-License-Identifier: Apache-2.0 -from Deeploy.DeeployTypes import NodeTemplate +from typing import List, Tuple -referenceTemplate = NodeTemplate(""" +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer + + +class _PULPSGDTemplate(NodeTemplate): + """In-place SGD template for PULP. + + weight_updated is aliased to weight so the memory allocator places them + at the same L2 address. This ensures the tiled egress DMA writes the + updated weight back to weight's L2 buffer — the same buffer the training + network reads from on the next forward pass. 
+ """ + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext( + self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, OperatorRepresentation, List[str]]: + weight = ctxt.lookup(operatorRepresentation['weight']) + weight_updated = ctxt.lookup(operatorRepresentation['weight_updated']) + + weight.aliases.add(weight_updated.name) + weight_updated.aliases.add(weight.name) + weight_updated._alias = weight.name + + # Make weight_updated share weight's L2 allocation (no separate malloc). + # The egress DMA then writes updated weights back to weight's L2 address. + weight_updated.allocTemplate = NodeTemplate( + " ${name} = (${type.typeName}) " + str(weight._instance) + ";") + weight_updated.deallocTemplate = NodeTemplate("") + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _PULPSGDTemplate(""" // SGD Weight Update with Separated Multiplication and Subtraction Unrolling // (Name: ${nodeName}, Op: ${nodeOp}) int8_t ${nodeName}_core_id = pi_core_id(); @@ -46,4 +79,4 @@ float32_t temp_grad = learning_rate * ref_${grad}[i]; ref_${weight_updated}[i] = ref_${weight}[i] - temp_grad; } -""") \ No newline at end of file +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/SoftmaxCrossEntropyLossTemplate.py b/Deeploy/Targets/PULPOpen/Templates/SoftmaxCrossEntropyLossTemplate.py index c1aefe01a3..4a3da4b3ee 100644 --- a/Deeploy/Targets/PULPOpen/Templates/SoftmaxCrossEntropyLossTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/SoftmaxCrossEntropyLossTemplate.py @@ -28,6 +28,31 @@ END_SINGLE_CORE """) +referenceDualOutputTemplate = NodeTemplate(""" +BEGIN_SINGLE_CORE + // SoftmaxCrossEntropyLoss dual-output (Name: ${nodeName}, Op: ${nodeOp}) + float32_t sce_total_loss = 0.0f; + for (uint32_t i = 0; i < ${batch}; i++) { + float32_t sce_max_logit = ${logits}[i * ${num_classes}]; + for (uint32_t j = 1; j < ${num_classes}; j++) { + if (${logits}[i * ${num_classes} + j] > sce_max_logit) 
+ sce_max_logit = ${logits}[i * ${num_classes} + j]; + } + float32_t sce_sum_exp = 0.0f; + for (uint32_t j = 0; j < ${num_classes}; j++) + sce_sum_exp += expf(${logits}[i * ${num_classes} + j] - sce_max_logit); + float32_t sce_log_sum_exp = logf(sce_sum_exp); + for (uint32_t j = 0; j < ${num_classes}; j++) + ${log_prob}[i * ${num_classes} + j] = + ${logits}[i * ${num_classes} + j] - sce_max_logit - sce_log_sum_exp; + sce_total_loss += -(${logits}[i * ${num_classes} + (uint32_t)(${labels}[i])] + - sce_max_logit - sce_log_sum_exp); + } + ${loss}[0] = sce_total_loss / (float32_t)${batch}; + printf(" [SCE] loss=%.6f\\r\\n", (double)${loss}[0]); +END_SINGLE_CORE +""") + referenceGradientTemplate = NodeTemplate(""" BEGIN_SINGLE_CORE // SoftmaxCrossEntropyLossGrad (Name: ${nodeName}, Op: ${nodeOp}) diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/AveragePoolTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/AveragePoolTileConstraint.py new file mode 100644 index 0000000000..7caa030bb7 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/AveragePoolTileConstraint.py @@ -0,0 +1,240 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class AveragePoolHWTileConstraint(TileConstraint): + + @staticmethod + def 
addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + inputBuffer = ctxt.lookup(name = parseDict['data_in']) + outputBuffer = ctxt.lookup(name = parseDict['data_out']) + + strides = parseDict["strides"] + padding = parseDict["pads"] + kernelShape = parseDict['kernel_shape'] + + # Add I/O dimensions to the model as variables + for bufferName in [inputBuffer.name, outputBuffer.name]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 0) + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3) + + outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 0) + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 2) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 3) + + # Map output dims to inputs dims + tilerModel.addConstraint(outputBatchVar == inputBatchVar) # Batch + tilerModel.addConstraint(outputChannelVar == inputChannelVar) # Channel + + effectiveHeight = inputHeightVar + ((padding[0] + padding[2]) * (inputHeightVar == inputBuffer.shape[1])) + effectiveWidth = inputWidthVar + ((padding[1] + padding[3]) * (inputWidthVar == inputBuffer.shape[2])) + + tilerModel.addConstraint((outputHeightVar == (effectiveHeight - (kernelShape[0] - 1) - 1) // strides[0] + 1)) + tilerModel.addConstraint((outputWidthVar == (effectiveWidth - (kernelShape[1] - 1) - 1) // strides[1] + 1)) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> 
TilerModel: + + # Get to-be-tiled tensor's buffers + inputBuffer = ctxt.lookup(name = parseDict['data_in']) + + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3) + + strides = parseDict["strides"] + + # VIC: Constraint the minimum tile size such that we can apply at least one kernel on it + tilerModel.addConstraint(inputChannelVar == parseDict['ch_im_in']) + tilerModel.addConstraint(inputHeightVar >= parseDict['dim_kernel_x']) + tilerModel.addConstraint(inputWidthVar >= parseDict['dim_kernel_y']) + + tilerModel.addConstraint((inputHeightVar % strides[0]) == 0) + tilerModel.addConstraint((inputWidthVar % strides[1]) == 0) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + varOut = operatorRepresentation['data_out'] + + inputInCubes = [] + replacements: Dict[str, List[int]] = { + "dim_im_in_x": [], + "dim_im_in_y": [], + "dim_im_out_x": [], + "dim_im_out_y": [], + "ch_im_in": [], + "padding_y_top": [], + "padding_y_bottom": [], + "padding_x_left": [], + "padding_x_right": [] + } + + replacementTypes = { + "dim_im_in_x": PointerClass(uint16_t), + "dim_im_in_y": PointerClass(uint16_t), + "dim_im_out_x": PointerClass(uint16_t), + "dim_im_out_y": PointerClass(uint16_t), + "ch_im_in": PointerClass(uint16_t), + "padding_y_top": PointerClass(uint8_t), + "padding_y_bottom": 
PointerClass(uint8_t), + "padding_x_left": PointerClass(uint8_t), + "padding_x_right": PointerClass(uint8_t) + } + + kernelShape = operatorRepresentation['kernel_shape'] + pads = operatorRepresentation['pads'] + strides = operatorRepresentation['strides'] + + for cube in outputCubes: + (BatchOffset, HOffset, WOffset, COffset) = cube.offset + (BatchSize, HSize, WSize, CSize) = cube.dims + + InCube, padding_tuple = Conv2DTileConstraint.computeInputCube((kernelShape[0], kernelShape[1]), pads, + strides, CSize, cube, + ctxt.lookup(varOut).shape) + padding_left, padding_right, padding_top, padding_bottom = padding_tuple + + replacements['dim_im_in_x'].append(InCube.dims[1]) + replacements['dim_im_in_y'].append(InCube.dims[2]) + replacements['dim_im_out_x'].append(HSize) + replacements['dim_im_out_y'].append(WSize) + replacements['ch_im_in'].append(CSize) + + replacements['padding_y_top'].append(padding_top) + replacements['padding_y_bottom'].append(padding_bottom) + replacements['padding_x_left'].append(padding_left) + replacements['padding_x_right'].append(padding_right) + + inputInCubes.append(InCube) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for a in inputInCubes: + inputLoadSchedule.append({"data_in": a}) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule + + +# RW: This constraint tiles the channels of avgpool, which avoids issues with padding margin calculations +# when the default memory level is L3. 
+
+
+class AveragePoolCTileConstraint(TileConstraint):
+
+    @staticmethod
+    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+        inputBufferName = parseDict['data_in']
+        outputBufferName = parseDict['data_out']
+
+        numDims = len(ctxt.lookup(inputBufferName).shape)
+
+        for bufferName in [inputBufferName, outputBufferName]:
+            tilerModel.addTensorDimToModel(ctxt, bufferName)
+
+        # RW: Apply constraints only to the Channel dimension
+        tilerModel.addConstraint(
+            tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = numDims - 1) ==
+            tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = numDims - 1))
+
+        return tilerModel
+
+    @staticmethod
+    def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+        inputBufferName = parseDict['data_in']
+        outputBufferName = parseDict['data_out']
+        numDims = len(ctxt.lookup(inputBufferName).shape)
+
+        for idx in range(numDims):
+            if idx != numDims - 1:  # RW: Keep all dimensions except C (last index, channels-last layout) fixed
+                tilerModel.addConstraint(
+                    tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = idx) == ctxt.lookup(
+                        outputBufferName).shape[idx])
+                tilerModel.addConstraint(
+                    tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = idx) == ctxt.lookup(
+                        inputBufferName).shape[idx])
+
+        return tilerModel
+
+    @classmethod
+    def serializeTilingSolution(
+            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
+            targetMemLevel: str, ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
+        outputCubes = [cube.rectangle for cube in absoluteOutputCubes]
+
+        addrNames = ['data_in', 'data_out']
+        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
+                                                                  operatorRepresentation, addrNames)
+
+        inputInCubes = []
+        replacementTypes = {}
+        replacements: Dict[str,
List[int]] = {} + + numDims = len(ctxt.lookup(operatorRepresentation['data_in']).shape) + replacementTypes["ch_im_in"] = PointerClass(uint16_t) + replacements["ch_im_in"] = [] + + input_shape = ctxt.lookup(operatorRepresentation['data_in']).shape + output_shape = ctxt.lookup(operatorRepresentation['data_out']).shape + + for cube in outputCubes: + input_offset = list(cube.offset) + input_dims = list(input_shape) + input_dims[-1] = cube.dims[-1] + InCube = HyperRectangle(tuple(input_offset), tuple(input_dims)) + inputInCubes.append(InCube) + + replacements["ch_im_in"].append(input_dims[-1]) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for a in inputInCubes: + inputLoadSchedule.append({"data_in": a}) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/BatchNormTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/BatchNormTileConstraint.py new file mode 100644 index 0000000000..f09bdd7c04 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/BatchNormTileConstraint.py @@ -0,0 +1,771 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import copy +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import 
AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class BatchNormInternalTileConstraint(TileConstraint): + """Tile constraint for BatchNormInternal (ORT training-mode BN forward pass). + + Inputs: X[N,C,H,W], gamma[C], beta[C], running_mean[C], running_var[C] + Outputs: Y[N,C,H,W] (primary), saved_mean[C], saved_inv_std[C] (secondary) + (updated_running_mean/var are not registered — no consumers) + + Tiling strategy: tile along C (channels are independent). + - N, H_in, W_in are pinned to full size: BN needs all N*H*W elements + per channel to compute per-channel mean and variance. + - C is free: channels are fully independent. + - Per-channel vectors (gamma, beta, running_mean, running_var, + saved_mean, saved_inv_std) tile with the C dimension of data_in. + """ + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + data_in_name = parseDict['data_in'] + data_out_name = parseDict['data_out'] + scale_name = parseDict['scale'] + bias_name = parseDict['bias'] + running_mean_name = parseDict['running_mean'] + running_var_name = parseDict['running_var'] + saved_mean_name = parseDict['saved_mean'] + saved_inv_std_name = parseDict['saved_inv_std'] + updated_running_mean_name = parseDict.get('updated_running_mean', '') + updated_running_var_name = parseDict.get('updated_running_var', '') + + for name in [ + data_in_name, data_out_name, scale_name, bias_name, running_mean_name, running_var_name, + saved_mean_name, saved_inv_std_name + ]: + tilerModel.addTensorDimToModel(ctxt, name) + + for name in [updated_running_mean_name, updated_running_var_name]: + if name: + tilerModel.addTensorDimToModel(ctxt, name) + + input_shape = ctxt.lookup(data_in_name).shape + N = input_shape[0] + H_in = input_shape[2] + W_in = input_shape[3] + + # Pin N, H_in, W_in: BN statistics require all spatial/batch elements per channel + 
tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName=data_in_name, dimIdx=0) == N) + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName=data_in_name, dimIdx=2) == H_in) + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName=data_in_name, dimIdx=3) == W_in) + + # data_out has the same shape as data_in + for idx in range(4): + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=data_in_name, dimIdx=idx) == + tilerModel.getTensorDimVar(tensorName=data_out_name, dimIdx=idx)) + + # Per-channel vectors: single dimension follows C (dim 1 of data_in) + for vec_name in [scale_name, bias_name, running_mean_name, running_var_name, saved_mean_name, + saved_inv_std_name]: + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=vec_name, dimIdx=0) == + tilerModel.getTensorDimVar(tensorName=data_in_name, dimIdx=1)) + + for vec_name in [updated_running_mean_name, updated_running_var_name]: + if vec_name: + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=vec_name, dimIdx=0) == + tilerModel.getTensorDimVar(tensorName=data_in_name, dimIdx=1)) + + return tilerModel + + @classmethod + def wrapTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, List[TilingSchedule]]: + + dataOutVar = operatorRepresentation['data_out'] + + # Pass a single-output solution to satisfy the base-class assertion + singleOutputSolution = copy.deepcopy(tilingSolution) + singleOutputSolution.outputTensorMemoryConstraints = { + dataOutVar: tilingSolution.outputTensorMemoryConstraints[dataOutVar] + } + + varReplacement, tilingSchedules = super().wrapTilingSolution(singleOutputSolution, targetMemLevel, ctxt, + operatorRepresentation) + + # Extend each schedule to include saved_mean and saved_inv_std outputs + for secondary in ['saved_mean', 'saved_inv_std']: + secondaryVar = 
operatorRepresentation.get(secondary, '') + if not secondaryVar: + continue + if secondaryVar not in tilingSolution.outputTensorMemoryConstraints: + continue + addr = TileConstraint.getBaseAddr(tilingSolution, targetMemLevel, secondaryVar) + if addr == [None]: + continue + for schedule in tilingSchedules: + schedule.outputBaseOffsets[secondary] = addr + for step in schedule.outputLoadSchedule: + data_out_rect = step['data_out'] + # Per-channel slice corresponding to the C tile in data_out + c_start = data_out_rect.offset[1] + c_tile = data_out_rect.dims[1] + step[secondary] = HyperRectangle((c_start,), (c_tile,)) + + return varReplacement, tilingSchedules + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + + output_cubes = [cube.rectangle for cube in absoluteOutputCubes] + + # Global model parameters (scale/bias/running_mean/running_var) may be excluded + # from the tiling solution by _checkResolve (global tensors with 1 memory level). + # Apply the same _in_solution guard as ConvTileConstraint.bias_in_solution. 
+ scale_in_solution = operatorRepresentation['scale'] in tilingSolution.tensorMemoryConstraints + bias_in_solution = operatorRepresentation['bias'] in tilingSolution.tensorMemoryConstraints + running_mean_in_solution = operatorRepresentation['running_mean'] in tilingSolution.tensorMemoryConstraints + running_var_in_solution = operatorRepresentation['running_var'] in tilingSolution.tensorMemoryConstraints + + addr_names = ['data_in', 'data_out'] + if scale_in_solution: + addr_names.append('scale') + if bias_in_solution: + addr_names.append('bias') + if running_mean_in_solution: + addr_names.append('running_mean') + if running_var_in_solution: + addr_names.append('running_var') + + input_base_offsets, output_base_offsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addr_names) + + replacements = {"C": []} + replacement_types = {"C": PointerClass(uint16_t)} + + input_load_schedule = [] + output_load_schedule = [] + + for cube in output_cubes: + # cube is the tile of data_out: [N, C_tile, H_in, W_in] + C_tile = cube.dims[1] + c_start = cube.offset[1] + + replacements["C"].append(C_tile) + + # Per-channel vector tile: offset=(c_start,), dims=(C_tile,) + vec_cube = HyperRectangle((c_start,), (C_tile,)) + + entry = {"data_in": cube} + if scale_in_solution: + entry["scale"] = vec_cube + if bias_in_solution: + entry["bias"] = vec_cube + if running_mean_in_solution: + entry["running_mean"] = vec_cube + if running_var_in_solution: + entry["running_var"] = vec_cube + input_load_schedule.append(entry) + output_load_schedule.append({"data_out": cube}) + + tiling_schedule = TilingSchedule(input_base_offsets, output_base_offsets, input_load_schedule, + output_load_schedule) + variable_replacement_schedule = VariableReplacementScheme(replacements, replacement_types) + return variable_replacement_schedule, tiling_schedule + + +class WelfordReduceTileConstraint(TileConstraint): + """Tile constraint for WelfordReduce (split BN forward reduction). 
+ + Inputs: X[N,C,H,W] + Outputs: saved_mean[C], saved_inv_std[C] + + Tiling: C is free; N, H, W are pinned (reduction over spatial). + Memory per tile: X_tile[1,C_tile,H,W] + mean[C_tile] + inv_std[C_tile] + → much smaller than monolithic BN (no Y needed during reduction). + """ + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + data_in_name = parseDict['data_in'] + saved_mean_name = parseDict['saved_mean'] + saved_inv_std_name = parseDict['saved_inv_std'] + + for name in [data_in_name, saved_mean_name, saved_inv_std_name]: + tilerModel.addTensorDimToModel(ctxt, name) + + input_shape = ctxt.lookup(data_in_name).shape + N = input_shape[0] + H_in = input_shape[2] + W_in = input_shape[3] + + # Pin N, H, W: reduction needs full spatial + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName=data_in_name, dimIdx=0) == N) + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName=data_in_name, dimIdx=2) == H_in) + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName=data_in_name, dimIdx=3) == W_in) + + # Per-channel outputs follow C + for vec_name in [saved_mean_name, saved_inv_std_name]: + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=vec_name, dimIdx=0) == + tilerModel.getTensorDimVar(tensorName=data_in_name, dimIdx=1)) + + return tilerModel + + @classmethod + def wrapTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, List[TilingSchedule]]: + + # Primary output is saved_mean; add saved_inv_std as secondary + savedMeanVar = operatorRepresentation['saved_mean'] + singleOutputSolution = copy.deepcopy(tilingSolution) + singleOutputSolution.outputTensorMemoryConstraints = { + savedMeanVar: tilingSolution.outputTensorMemoryConstraints[savedMeanVar] + } + + varReplacement, tilingSchedules = 
super().wrapTilingSolution(singleOutputSolution, targetMemLevel, ctxt, + operatorRepresentation) + + secondaryVar = operatorRepresentation['saved_inv_std'] + if secondaryVar in tilingSolution.outputTensorMemoryConstraints: + addr = TileConstraint.getBaseAddr(tilingSolution, targetMemLevel, secondaryVar) + if addr != [None]: + for schedule in tilingSchedules: + schedule.outputBaseOffsets['saved_inv_std'] = addr + for step in schedule.outputLoadSchedule: + mean_rect = step['saved_mean'] + step['saved_inv_std'] = HyperRectangle(mean_rect.offset, mean_rect.dims) + + return varReplacement, tilingSchedules + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + + output_cubes = [cube.rectangle for cube in absoluteOutputCubes] + + addr_names = ['data_in'] + input_base_offsets, output_base_offsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addr_names) + + replacements = {"C": []} + replacement_types = {"C": PointerClass(uint16_t)} + + input_load_schedule = [] + output_load_schedule = [] + + for cube in output_cubes: + # cube is the tile of saved_mean: (C_tile,) + C_tile = cube.dims[0] + c_start = cube.offset[0] + + replacements["C"].append(C_tile) + + input_shape = ctxt.lookup(operatorRepresentation['data_in']).shape + N = input_shape[0] + H_in = input_shape[2] + W_in = input_shape[3] + data_in_cube = HyperRectangle((0, c_start, 0, 0), (N, C_tile, H_in, W_in)) + + input_load_schedule.append({"data_in": data_in_cube}) + output_load_schedule.append({"saved_mean": cube}) + + tiling_schedule = TilingSchedule(input_base_offsets, output_base_offsets, input_load_schedule, + output_load_schedule) + variable_replacement_schedule = VariableReplacementScheme(replacements, replacement_types) + return 
variable_replacement_schedule, tiling_schedule + + +class ChannelNormalizeTileConstraint(TileConstraint): + """Tile constraint for ChannelNormalize (split BN forward elementwise). + + Inputs: X[N,C,H,W], saved_mean[C], saved_inv_std[C], gamma[C], beta[C] + Outputs: Y[N,C,H,W] + + Tiling: C, H, W are all free (elementwise per-channel op). + """ + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + data_in_name = parseDict['data_in'] + data_out_name = parseDict['data_out'] + saved_mean_name = parseDict['saved_mean'] + saved_inv_std_name = parseDict['saved_inv_std'] + gamma_name = parseDict['gamma'] + beta_name = parseDict['beta'] + + for name in [data_in_name, data_out_name, saved_mean_name, saved_inv_std_name, gamma_name, beta_name]: + tilerModel.addTensorDimToModel(ctxt, name) + + # data_out has the same shape as data_in + for idx in range(4): + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=data_in_name, dimIdx=idx) == + tilerModel.getTensorDimVar(tensorName=data_out_name, dimIdx=idx)) + + # Per-channel vectors follow C (dim 1) + for vec_name in [saved_mean_name, saved_inv_std_name, gamma_name, beta_name]: + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=vec_name, dimIdx=0) == + tilerModel.getTensorDimVar(tensorName=data_in_name, dimIdx=1)) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + + output_cubes = [cube.rectangle for cube in absoluteOutputCubes] + + saved_mean_in_solution = operatorRepresentation['saved_mean'] in tilingSolution.tensorMemoryConstraints + saved_inv_std_in_solution = operatorRepresentation['saved_inv_std'] in tilingSolution.tensorMemoryConstraints + gamma_in_solution = 
operatorRepresentation['gamma'] in tilingSolution.tensorMemoryConstraints + beta_in_solution = operatorRepresentation['beta'] in tilingSolution.tensorMemoryConstraints + + addr_names = ['data_in', 'data_out'] + if saved_mean_in_solution: + addr_names.append('saved_mean') + if saved_inv_std_in_solution: + addr_names.append('saved_inv_std') + if gamma_in_solution: + addr_names.append('gamma') + if beta_in_solution: + addr_names.append('beta') + + input_base_offsets, output_base_offsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addr_names) + + replacements = {"C": [], "H_in": [], "W_in": []} + replacement_types = {"C": PointerClass(uint16_t), "H_in": PointerClass(uint16_t), + "W_in": PointerClass(uint16_t)} + + input_load_schedule = [] + output_load_schedule = [] + + for cube in output_cubes: + C_tile = cube.dims[1] + H_tile = cube.dims[2] + W_tile = cube.dims[3] + c_start = cube.offset[1] + + replacements["C"].append(C_tile) + replacements["H_in"].append(H_tile) + replacements["W_in"].append(W_tile) + + vec_cube = HyperRectangle((c_start,), (C_tile,)) + + entry = {"data_in": cube} + if saved_mean_in_solution: + entry["saved_mean"] = vec_cube + if saved_inv_std_in_solution: + entry["saved_inv_std"] = vec_cube + if gamma_in_solution: + entry["gamma"] = vec_cube + if beta_in_solution: + entry["beta"] = vec_cube + input_load_schedule.append(entry) + output_load_schedule.append({"data_out": cube}) + + tiling_schedule = TilingSchedule(input_base_offsets, output_base_offsets, input_load_schedule, + output_load_schedule) + variable_replacement_schedule = VariableReplacementScheme(replacements, replacement_types) + return variable_replacement_schedule, tiling_schedule + + +class BNGradReduceTileConstraint(TileConstraint): + """Tile constraint for BNGradReduce (split BN backward reduction). 
+ + Inputs: dY[N,C,H,W], X[N,C,H,W], saved_mean[C], saved_inv_std[C] + Outputs: dgamma[C], dbeta[C] + + Tiling: C is free; N, H, W are pinned (reduction over spatial). + """ + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + dY_name = parseDict['dY'] + X_name = parseDict['X'] + saved_mean_name = parseDict['saved_mean'] + saved_inv_std_name = parseDict['saved_inv_std'] + dgamma_name = parseDict['dgamma'] + dbeta_name = parseDict['dbeta'] + + for name in [dY_name, X_name, saved_mean_name, saved_inv_std_name, dgamma_name, dbeta_name]: + tilerModel.addTensorDimToModel(ctxt, name) + + input_shape = ctxt.lookup(dY_name).shape + N = input_shape[0] + H_in = input_shape[2] + W_in = input_shape[3] + + # Pin N, H, W + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName=dY_name, dimIdx=0) == N) + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName=dY_name, dimIdx=2) == H_in) + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName=dY_name, dimIdx=3) == W_in) + + # X has the same shape as dY + for idx in range(4): + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=dY_name, dimIdx=idx) == + tilerModel.getTensorDimVar(tensorName=X_name, dimIdx=idx)) + + # Per-channel vectors follow C + for vec_name in [saved_mean_name, saved_inv_std_name, dgamma_name, dbeta_name]: + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=vec_name, dimIdx=0) == + tilerModel.getTensorDimVar(tensorName=dY_name, dimIdx=1)) + + return tilerModel + + @classmethod + def wrapTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, List[TilingSchedule]]: + + dgammaVar = operatorRepresentation['dgamma'] + singleOutputSolution = copy.deepcopy(tilingSolution) + singleOutputSolution.outputTensorMemoryConstraints = { + dgammaVar: 
tilingSolution.outputTensorMemoryConstraints[dgammaVar] + } + + varReplacement, tilingSchedules = super().wrapTilingSolution(singleOutputSolution, targetMemLevel, ctxt, + operatorRepresentation) + + secondaryVar = operatorRepresentation['dbeta'] + if secondaryVar in tilingSolution.outputTensorMemoryConstraints: + addr = TileConstraint.getBaseAddr(tilingSolution, targetMemLevel, secondaryVar) + if addr != [None]: + for schedule in tilingSchedules: + schedule.outputBaseOffsets['dbeta'] = addr + for step in schedule.outputLoadSchedule: + dgamma_rect = step['dgamma'] + step['dbeta'] = HyperRectangle(dgamma_rect.offset, dgamma_rect.dims) + + return varReplacement, tilingSchedules + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + + output_cubes = [cube.rectangle for cube in absoluteOutputCubes] + + saved_mean_in_solution = operatorRepresentation['saved_mean'] in tilingSolution.tensorMemoryConstraints + saved_inv_std_in_solution = operatorRepresentation['saved_inv_std'] in tilingSolution.tensorMemoryConstraints + + addr_names = ['dY', 'X'] + if saved_mean_in_solution: + addr_names.append('saved_mean') + if saved_inv_std_in_solution: + addr_names.append('saved_inv_std') + + input_base_offsets, output_base_offsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addr_names) + + replacements = {"C": []} + replacement_types = {"C": PointerClass(uint16_t)} + + input_load_schedule = [] + output_load_schedule = [] + + for cube in output_cubes: + # cube is the tile of dgamma: (C_tile,) + C_tile = cube.dims[0] + c_start = cube.offset[0] + + replacements["C"].append(C_tile) + + input_shape = ctxt.lookup(operatorRepresentation['dY']).shape + N = input_shape[0] + H_in = input_shape[2] + W_in = input_shape[3] + 
data_cube = HyperRectangle((0, c_start, 0, 0), (N, C_tile, H_in, W_in)) + vec_cube = HyperRectangle((c_start,), (C_tile,)) + + entry = {"dY": data_cube, "X": data_cube} + if saved_mean_in_solution: + entry["saved_mean"] = vec_cube + if saved_inv_std_in_solution: + entry["saved_inv_std"] = vec_cube + input_load_schedule.append(entry) + output_load_schedule.append({"dgamma": cube}) + + tiling_schedule = TilingSchedule(input_base_offsets, output_base_offsets, input_load_schedule, + output_load_schedule) + variable_replacement_schedule = VariableReplacementScheme(replacements, replacement_types) + return variable_replacement_schedule, tiling_schedule + + +class BNGradNormalizeTileConstraint(TileConstraint): + """Tile constraint for BNGradNormalize (split BN backward elementwise). + + Inputs: dY[N,C,H,W], X[N,C,H,W], saved_mean[C], saved_inv_std[C], + gamma[C], dgamma[C], dbeta[C] + Outputs: dX[N,C,H,W] + + Tiling: C, H, W are all free (elementwise per-channel op). + """ + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + dY_name = parseDict['dY'] + X_name = parseDict['X'] + saved_mean_name = parseDict['saved_mean'] + saved_inv_std_name = parseDict['saved_inv_std'] + gamma_name = parseDict['gamma'] + dgamma_name = parseDict['dgamma'] + dbeta_name = parseDict['dbeta'] + dX_name = parseDict['dX'] + + for name in [dY_name, X_name, saved_mean_name, saved_inv_std_name, gamma_name, dgamma_name, dbeta_name, + dX_name]: + tilerModel.addTensorDimToModel(ctxt, name) + + # dY, X, dX must have the same shape + for idx in range(4): + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=dY_name, dimIdx=idx) == + tilerModel.getTensorDimVar(tensorName=X_name, dimIdx=idx)) + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=dY_name, dimIdx=idx) == + tilerModel.getTensorDimVar(tensorName=dX_name, dimIdx=idx)) + + # Per-channel vectors follow C (dim 1) + for vec_name in 
[saved_mean_name, saved_inv_std_name, gamma_name, dgamma_name, dbeta_name]: + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=vec_name, dimIdx=0) == + tilerModel.getTensorDimVar(tensorName=dY_name, dimIdx=1)) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + + output_cubes = [cube.rectangle for cube in absoluteOutputCubes] + + saved_mean_in_solution = operatorRepresentation['saved_mean'] in tilingSolution.tensorMemoryConstraints + saved_inv_std_in_solution = operatorRepresentation['saved_inv_std'] in tilingSolution.tensorMemoryConstraints + gamma_in_solution = operatorRepresentation['gamma'] in tilingSolution.tensorMemoryConstraints + dgamma_in_solution = operatorRepresentation['dgamma'] in tilingSolution.tensorMemoryConstraints + dbeta_in_solution = operatorRepresentation['dbeta'] in tilingSolution.tensorMemoryConstraints + + addr_names = ['dY', 'X', 'dX'] + if saved_mean_in_solution: + addr_names.append('saved_mean') + if saved_inv_std_in_solution: + addr_names.append('saved_inv_std') + if gamma_in_solution: + addr_names.append('gamma') + if dgamma_in_solution: + addr_names.append('dgamma') + if dbeta_in_solution: + addr_names.append('dbeta') + + input_base_offsets, output_base_offsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addr_names) + + replacements = {"C": [], "H_in": [], "W_in": []} + replacement_types = {"C": PointerClass(uint16_t), "H_in": PointerClass(uint16_t), + "W_in": PointerClass(uint16_t)} + + input_load_schedule = [] + output_load_schedule = [] + + for cube in output_cubes: + C_tile = cube.dims[1] + H_tile = cube.dims[2] + W_tile = cube.dims[3] + c_start = cube.offset[1] + + replacements["C"].append(C_tile) + 
replacements["H_in"].append(H_tile) + replacements["W_in"].append(W_tile) + + vec_cube = HyperRectangle((c_start,), (C_tile,)) + + entry = {"dY": cube, "X": cube} + if saved_mean_in_solution: + entry["saved_mean"] = vec_cube + if saved_inv_std_in_solution: + entry["saved_inv_std"] = vec_cube + if gamma_in_solution: + entry["gamma"] = vec_cube + if dgamma_in_solution: + entry["dgamma"] = vec_cube + if dbeta_in_solution: + entry["dbeta"] = vec_cube + input_load_schedule.append(entry) + output_load_schedule.append({"dX": cube}) + + tiling_schedule = TilingSchedule(input_base_offsets, output_base_offsets, input_load_schedule, + output_load_schedule) + variable_replacement_schedule = VariableReplacementScheme(replacements, replacement_types) + return variable_replacement_schedule, tiling_schedule + + +class BatchNormalizationGradTileConstraint(TileConstraint): + """Tile constraint for BatchNormalizationGrad (BN backward pass). + + Inputs: dY[N,C,H,W], X[N,C,H,W], gamma[C], saved_mean[C], saved_inv_std[C] + Outputs: dX[N,C,H,W] (primary), dgamma[C], dbeta[C] (secondary) + + Tiling strategy: tile along C (channels are independent). + - N, H_in, W_in are pinned to full size: backward BN needs all N*H*W + elements per channel for dgamma/dbeta reductions. + - C is free: each channel's gradient is computed independently. 
+ """ + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + dY_name = parseDict['dY'] + X_name = parseDict['X'] + gamma_name = parseDict['gamma'] + saved_mean_name = parseDict['saved_mean'] + saved_inv_std_name = parseDict['saved_inv_std'] + dX_name = parseDict['dX'] + dgamma_name = parseDict['dgamma'] + dbeta_name = parseDict['dbeta'] + + for name in [dY_name, X_name, gamma_name, saved_mean_name, saved_inv_std_name, dX_name, dgamma_name, + dbeta_name]: + tilerModel.addTensorDimToModel(ctxt, name) + + input_shape = ctxt.lookup(dY_name).shape + N = input_shape[0] + H_in = input_shape[2] + W_in = input_shape[3] + + # Pin N, H_in, W_in: backward BN needs all spatial/batch elements for reductions + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName=dY_name, dimIdx=0) == N) + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName=dY_name, dimIdx=2) == H_in) + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName=dY_name, dimIdx=3) == W_in) + + # X, dX must have the same shape as dY + for idx in range(4): + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=dY_name, dimIdx=idx) == + tilerModel.getTensorDimVar(tensorName=X_name, dimIdx=idx)) + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=dY_name, dimIdx=idx) == + tilerModel.getTensorDimVar(tensorName=dX_name, dimIdx=idx)) + + # Per-channel vectors: single dimension follows C (dim 1 of dY) + for vec_name in [gamma_name, saved_mean_name, saved_inv_std_name, dgamma_name, dbeta_name]: + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=vec_name, dimIdx=0) == + tilerModel.getTensorDimVar(tensorName=dY_name, dimIdx=1)) + + return tilerModel + + @classmethod + def wrapTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, 
List[TilingSchedule]]: + + dXVar = operatorRepresentation['dX'] + + # Pass a single-output solution to satisfy the base-class assertion + singleOutputSolution = copy.deepcopy(tilingSolution) + singleOutputSolution.outputTensorMemoryConstraints = { + dXVar: tilingSolution.outputTensorMemoryConstraints[dXVar] + } + + varReplacement, tilingSchedules = super().wrapTilingSolution(singleOutputSolution, targetMemLevel, ctxt, + operatorRepresentation) + + # Extend each schedule to include dgamma and dbeta outputs + for secondary in ['dgamma', 'dbeta']: + secondaryVar = operatorRepresentation.get(secondary, '') + if not secondaryVar: + continue + if secondaryVar not in tilingSolution.outputTensorMemoryConstraints: + continue + addr = TileConstraint.getBaseAddr(tilingSolution, targetMemLevel, secondaryVar) + if addr == [None]: + continue + for schedule in tilingSchedules: + schedule.outputBaseOffsets[secondary] = addr + for step in schedule.outputLoadSchedule: + dX_rect = step['dX'] + c_start = dX_rect.offset[1] + c_tile = dX_rect.dims[1] + step[secondary] = HyperRectangle((c_start,), (c_tile,)) + + return varReplacement, tilingSchedules + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + + output_cubes = [cube.rectangle for cube in absoluteOutputCubes] + + # gamma is a learnable model parameter (global) and may be excluded from the tiling + # solution by _checkResolve. saved_mean/saved_inv_std are transient BN forward outputs + # and are normally in the solution, but guard them too for robustness. 
+ gamma_in_solution = operatorRepresentation['gamma'] in tilingSolution.tensorMemoryConstraints + saved_mean_in_solution = operatorRepresentation['saved_mean'] in tilingSolution.tensorMemoryConstraints + saved_inv_std_in_solution = operatorRepresentation['saved_inv_std'] in tilingSolution.tensorMemoryConstraints + + addr_names = ['dY', 'X', 'dX'] + if gamma_in_solution: + addr_names.append('gamma') + if saved_mean_in_solution: + addr_names.append('saved_mean') + if saved_inv_std_in_solution: + addr_names.append('saved_inv_std') + + input_base_offsets, output_base_offsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addr_names) + + replacements = {"C": []} + replacement_types = {"C": PointerClass(uint16_t)} + + input_load_schedule = [] + output_load_schedule = [] + + for cube in output_cubes: + # cube is the tile of dX: [N, C_tile, H_in, W_in] + C_tile = cube.dims[1] + c_start = cube.offset[1] + + replacements["C"].append(C_tile) + + vec_cube = HyperRectangle((c_start,), (C_tile,)) + + entry = {"dY": cube, "X": cube} + if gamma_in_solution: + entry["gamma"] = vec_cube + if saved_mean_in_solution: + entry["saved_mean"] = vec_cube + if saved_inv_std_in_solution: + entry["saved_inv_std"] = vec_cube + input_load_schedule.append(entry) + output_load_schedule.append({"dX": cube}) + + tiling_schedule = TilingSchedule(input_base_offsets, output_base_offsets, input_load_schedule, + output_load_schedule) + variable_replacement_schedule = VariableReplacementScheme(replacements, replacement_types) + return variable_replacement_schedule, tiling_schedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/ConvGradConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/ConvGradConstraint.py new file mode 100644 index 0000000000..19e847d3bb --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/ConvGradConstraint.py @@ -0,0 +1,1023 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: 
Apache-2.0 + +from typing import Dict, List, Optional, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class ConvGradXTileConstraintBase(TileConstraint): + """ + Base for ConvGradX2D tiling: + + - absoluteOutputCubes are tiles of grad_in (dX) (operatorRepresentation[gradInKey]) + - for each dX tile, derive required grad_out (dY) halo tile + - weight is full (not tiled) + - emits unified template params: + ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right}, + ${offset_grad_in_h}, ${offset_grad_in_w}, ${offset_grad_out_h}, ${offset_grad_out_w} + """ + + # ---- parser/opRep keys (override in subclasses if needed) ---- + # In Deeploy ConvGradX parsers these are commonly "data_in" (dY) and "data_out" (dX). 
+ gradOutKey = "grad_out" # dY + gradInKey = "grad_in" # dX + weightKey = "weight" # W + + # --------------------------- + # 1) Geometrical constraints + # --------------------------- + @classmethod + def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + dyName = parseDict[cls.gradOutKey] + dxName = parseDict[cls.gradInKey] + wName = parseDict[cls.weightKey] + + tilerModel.addTensorDimToModel(ctxt, dyName) + tilerModel.addTensorDimToModel(ctxt, dxName) + tilerModel.addTensorDimToModel(ctxt, wName) + + group = parseDict.get("group", 1) + + # N match + tilerModel.addConstraint(tilerModel.getTensorDimVar(dyName, 0) == tilerModel.getTensorDimVar(dxName, 0)) + + # Channel relations: + # dY: [N, C_out, H_out, W_out] + # dX: [N, C_in, H_in, W_in] + # W : [C_out, C_in/group, P, Q] + tilerModel.addConstraint(tilerModel.getTensorDimVar(dyName, 1) == tilerModel.getTensorDimVar(wName, 0)) + tilerModel.addConstraint(tilerModel.getTensorDimVar(dxName, 1) == tilerModel.getTensorDimVar(wName, 1) * group) + + return tilerModel + + # ----------------------- + # 2) Policy constraints + # ----------------------- + @classmethod + def addPolicyConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + """ + Default policy: + - keep full Cin/Cout + - weight not tiled + - allow spatial tiling on dX + """ + dyName = parseDict[cls.gradOutKey] + dxName = parseDict[cls.gradInKey] + wName = parseDict[cls.weightKey] + + dyBuf = ctxt.lookup(dyName) + dxBuf = ctxt.lookup(dxName) + wBuf = ctxt.lookup(wName) + + # full channels + tilerModel.addConstraint(tilerModel.getTensorDimVar(dyName, 1) == dyBuf.shape[1]) # Cout full + tilerModel.addConstraint(tilerModel.getTensorDimVar(dxName, 1) == dxBuf.shape[1]) # Cin full + + # weight not tiled + tilerModel.addConstraint(tilerModel.getTensorDimVar(wName, 0) == wBuf.shape[0]) + tilerModel.addConstraint(tilerModel.getTensorDimVar(wName, 1) == wBuf.shape[1]) + 
tilerModel.addConstraint(tilerModel.getTensorDimVar(wName, 2) == wBuf.shape[2]) + tilerModel.addConstraint(tilerModel.getTensorDimVar(wName, 3) == wBuf.shape[3]) + + return tilerModel + + # ----------------------------------- + # 3) Symbolic node representation + # ----------------------------------- + @classmethod + def constructSymbolicNodeRep( + cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext + ) -> Dict[str, Union[int, IntVar]]: + """ + Bind template fields: + dim_im_out_* / ch_im_out : grad_out (dY) + dim_im_in_* / ch_im_in : grad_in (dX) + """ + dyName = parseDict[cls.gradOutKey] + dxName = parseDict[cls.gradInKey] + wName = parseDict[cls.weightKey] + + symbolic = parseDict.copy() + + # dY (grad_out) + symbolic["dim_im_out_x"] = tilerModel.getTensorDimVar(dyName, 2) # H_out tile + symbolic["dim_im_out_y"] = tilerModel.getTensorDimVar(dyName, 3) # W_out tile + symbolic["ch_im_out"] = tilerModel.getTensorDimVar(dyName, 1) # Cout + + # dX (grad_in) + symbolic["dim_im_in_x"] = tilerModel.getTensorDimVar(dxName, 2) # H_in tile + symbolic["dim_im_in_y"] = tilerModel.getTensorDimVar(dxName, 3) # W_in tile + symbolic["ch_im_in"] = tilerModel.getTensorDimVar(dxName, 1) # Cin + + # kernel (H,W) + symbolic["dim_kernel_x"] = tilerModel.getTensorDimVar(wName, 2) # P + symbolic["dim_kernel_y"] = tilerModel.getTensorDimVar(wName, 3) # Q + + # offsets filled in serialize + symbolic["offset_grad_in_h"] = 0 + symbolic["offset_grad_in_w"] = 0 + symbolic["offset_grad_out_h"] = 0 + symbolic["offset_grad_out_w"] = 0 + + return symbolic + + # ------------------------------- + # helpers + # ------------------------------- + @staticmethod + def _ceil_div(a: int, b: int) -> int: + return -((-a) // b) + + @staticmethod + def _floor_div(a: int, b: int) -> int: + return a // b + + @classmethod + def get_kernel_hw(cls, ctxt: NetworkContext, wName: str, wShape: Tuple[int, int, int, int]) -> Tuple[int, int]: + return wShape[2], wShape[3] + + @classmethod + def 
get_dy_channels( + cls, + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation, + dyName: str, + dxName: str, + wName: str, + dyFull: Tuple[int, int, int, int], + dxFull: Tuple[int, int, int, int], + wShape: Tuple[int, int, int, int], + ) -> int: + # default ConvGradX: dy channels == weight[0] (Cout) + return wShape[0] + + @classmethod + def get_ch_im_out(cls, ctxt: NetworkContext, dyFull, dxFull, wShape) -> int: + # template's ch_im_out should match dY channels (Cout) + return dyFull[1] + + @classmethod + def get_ch_im_in(cls, ctxt: NetworkContext, dyFull, dxFull, wShape) -> int: + # template's ch_im_in should match dX channels (Cin) + return dxFull[1] + + @classmethod + def map_onnx_pads_to_template(cls, tpt: int, tpb: int, tpl: int, tpr: int) -> Tuple[int, int, int, int]: + """ + ONNX pads are (top, bottom, left, right) where top/bottom are H, left/right are W. + + Template unified order: + (${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right}) + + Both tiled C kernels expect: + - arg1 (${padding_y_top}) -> pad_top (H_begin) + - arg2 (${padding_y_bottom}) -> pad_bottom (H_end) + - arg3 (${padding_x_left}) -> pad_left (W_begin) + - arg4 (${padding_x_right}) -> pad_right (W_end) + + So the mapping is the identity: padding_y_top=top, padding_x_left=left. 
+ """ + return (tpt, tpb, tpl, tpr) + + @classmethod + def computeDyCubeFromDxTile( + cls, + dxTile: HyperRectangle, # (N,Cin,Hx,Wx) + dyFull: Tuple[int, int, int, int], # full dY + P: int, + Q: int, + pads: Tuple[int, int, int, int], # (t,b,l,r) + strides: Tuple[int, int], # (sh, sw) + dyC: int, # Cout for this op + dxAbsOff: Tuple[int, int, int, int], # abs offset for boundary decision + ) -> Tuple[HyperRectangle, Tuple[int, int, int, int]]: + + (nOff, _cOff, _hxOff_rel, _wxOff_rel) = dxTile.offset + (nSize, _cinSize, hxSize, wxSize) = dxTile.dims + + (_, _, hxOff_abs, wxOff_abs) = dxAbsOff + + pad_top, pad_bottom, pad_left, pad_right = pads + sh, sw = strides + + hx0 = hxOff_abs + hx1 = hxOff_abs + hxSize - 1 + wx0 = wxOff_abs + wx1 = wxOff_abs + wxSize - 1 + + Hy = dyFull[2] + Wy = dyFull[3] + + oy0 = cls._ceil_div(hx0 - (P - 1) + pad_top, sh) + oy1 = cls._floor_div(hx1 + pad_top, sh) + ox0 = cls._ceil_div(wx0 - (Q - 1) + pad_left, sw) + ox1 = cls._floor_div(wx1 + pad_left, sw) + + oy0 = max(0, oy0) + ox0 = max(0, ox0) + oy1 = min(Hy - 1, oy1) + ox1 = min(Wy - 1, ox1) + + if oy0 > oy1 or ox0 > ox1: + raise RuntimeError( + f"dx tile {dxTile.offset}/{dxTile.dims} produces empty dy halo: " + f"oy[{oy0},{oy1}] ox[{ox0},{ox1}] (Hy={Hy},Wy={Wy},P={P},Q={Q},pads={pads},strides={strides})" + ) + + dyH = oy1 - oy0 + 1 + dyW = ox1 - ox0 + 1 + + dyCube = HyperRectangle( + (nOff, 0, oy0, ox0), # dY: (N, C_out, H, W) + (nSize, dyC, dyH, dyW) + ) + + # tile-level ONNX pads only at boundary + tile_pad_top = pad_top if oy0 == 0 else 0 + tile_pad_bottom = pad_bottom if (oy0 + dyH) == Hy else 0 + tile_pad_left = pad_left if ox0 == 0 else 0 + tile_pad_right = pad_right if (ox0 + dyW) == Wy else 0 + + return dyCube, (tile_pad_top, tile_pad_bottom, tile_pad_left, tile_pad_right) + + @staticmethod + def _get_abs_off(abs_obj: AbsoluteHyperRectangle, fallback_rect: HyperRectangle): + abs_off = getattr(abs_obj, "absoluteOffset", None) + if abs_off is None: + abs_off = getattr(abs_obj, 
"absolute_offset", None) + if abs_off is None: + abs_off = fallback_rect.offset + return abs_off + + @classmethod + def extraSerializeChecks(cls, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> None: + """Hook for DW checks etc.""" + return + + # --------------------------------------------------- + # 4) serialize: dx tiles -> dy halo tiles + # --------------------------------------------------- + @classmethod + def serializeTilingSolution( + cls, + tilingSolution: NodeMemoryConstraint, + absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation + ) -> Tuple[VariableReplacementScheme, TilingSchedule]: + + cls.extraSerializeChecks(ctxt, operatorRepresentation) + + varDY = operatorRepresentation[cls.gradOutKey] # dY + varW = operatorRepresentation[cls.weightKey] # W + varDX = operatorRepresentation[cls.gradInKey] # dX + + _pads = list(operatorRepresentation.get("pads", [0, 0, 0, 0])) # ONNX: [H_begin, W_begin, H_end, W_end] + pads = (_pads[0], _pads[2], _pads[1], _pads[3]) # reorder to (top, bottom, left, right) + strides = tuple(operatorRepresentation.get("strides", [1, 1])) # (sh,sw) + + dyFull = tuple(ctxt.lookup(varDY).shape) # (N,Cout,Ho,Wo) + dxFull = tuple(ctxt.lookup(varDX).shape) # (N,Cin,Hi,Wi) + wShape = tuple(ctxt.lookup(varW).shape) # (Cout,Cin/group,P,Q) or DW: (Cin,1,P,Q) + + P, Q = cls.get_kernel_hw(ctxt, varW, wShape) + dyC = cls.get_dy_channels(ctxt, operatorRepresentation, varDY, varDX, varW, dyFull, dxFull, wShape) + + dxTiles = [c.rectangle for c in absoluteOutputCubes] + + # weight may be a Constant op excluded from the tiling solution + varW_name = operatorRepresentation[cls.weightKey] + weight_in_solution = varW_name in tilingSolution.tensorMemoryConstraints + + addrNames = [cls.gradOutKey] + if weight_in_solution: + addrNames.append(cls.weightKey) + addrNames.append(cls.gradInKey) + inputBaseOffsets, outputBaseOffsets = 
cls.extractBaseAddr( + tilingSolution, targetMemLevel, operatorRepresentation, addrNames + ) + + replacements: Dict[str, List[int]] = { + "dim_im_in_x": [], + "dim_im_in_y": [], + "dim_im_out_x": [], + "dim_im_out_y": [], + "ch_im_in": [], + "ch_im_out": [], + + # unified template order: + "padding_y_top": [], + "padding_y_bottom": [], + "padding_x_left": [], + "padding_x_right": [], + + "offset_grad_in_h": [], + "offset_grad_in_w": [], + "offset_grad_out_h": [], + "offset_grad_out_w": [], + } + + replacementTypes = { + "dim_im_in_x": PointerClass(uint16_t), + "dim_im_in_y": PointerClass(uint16_t), + "dim_im_out_x": PointerClass(uint16_t), + "dim_im_out_y": PointerClass(uint16_t), + "ch_im_in": PointerClass(uint16_t), + "ch_im_out": PointerClass(uint16_t), + + "padding_y_top": PointerClass(uint8_t), + "padding_y_bottom": PointerClass(uint8_t), + "padding_x_left": PointerClass(uint8_t), + "padding_x_right": PointerClass(uint8_t), + + "offset_grad_in_h": PointerClass(uint16_t), + "offset_grad_in_w": PointerClass(uint16_t), + "offset_grad_out_h": PointerClass(uint16_t), + "offset_grad_out_w": PointerClass(uint16_t), + } + + inputDyCubes: List[HyperRectangle] = [] + inputWCubes: List[HyperRectangle] = [] + outputDxCubes: List[HyperRectangle] = [] + + fullW = HyperRectangle((0, 0, 0, 0), wShape) + + ch_in = cls.get_ch_im_in(ctxt, dyFull, dxFull, wShape) + ch_out = cls.get_ch_im_out(ctxt, dyFull, dxFull, wShape) + + for idx, dxCube in enumerate(dxTiles): + abs_off = cls._get_abs_off(absoluteOutputCubes[idx], dxCube) + + dyCube, (tpt, tpb, tpl, tpr) = cls.computeDyCubeFromDxTile( + dxTile=dxCube, + dyFull=dyFull, + P=P, Q=Q, + pads=pads, + strides=strides, + dyC=dyC, # IMPORTANT: use computed dyC + dxAbsOff=abs_off + ) + + replacements["dim_im_in_x"].append(dxCube.dims[2]) # H_in_tile + replacements["dim_im_in_y"].append(dxCube.dims[3]) # W_in_tile + replacements["dim_im_out_x"].append(dyCube.dims[2]) # H_out_tile (halo) + 
replacements["dim_im_out_y"].append(dyCube.dims[3]) # W_out_tile (halo) + + replacements["ch_im_in"].append(ch_in) + replacements["ch_im_out"].append(ch_out) + + py_top, py_bottom, px_left, px_right = cls.map_onnx_pads_to_template(tpt, tpb, tpl, tpr) + replacements["padding_y_top"].append(py_top) + replacements["padding_y_bottom"].append(py_bottom) + replacements["padding_x_left"].append(px_left) + replacements["padding_x_right"].append(px_right) + + replacements["offset_grad_in_h"].append(abs_off[2]) + replacements["offset_grad_in_w"].append(abs_off[3]) + replacements["offset_grad_out_h"].append(dyCube.offset[2]) + replacements["offset_grad_out_w"].append(dyCube.offset[3]) + + inputDyCubes.append(dyCube) + inputWCubes.append(fullW) + outputDxCubes.append(dxCube) + + if weight_in_solution: + inputLoadSchedule = [{cls.gradOutKey: dy, cls.weightKey: w} for dy, w in zip(inputDyCubes, inputWCubes)] + else: + inputLoadSchedule = [{cls.gradOutKey: dy} for dy in inputDyCubes] + outputLoadSchedule = [{cls.gradInKey: dx} for dx in outputDxCubes] + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + return variableReplacementSchedule, tilingSchedule + + +# ============================================================================ +# ConvGradX: subclass reusing the base +# ============================================================================ + +class ConvGradX2DHWTileConstraint(ConvGradXTileConstraintBase): + pass + + +class ConvGradX2DIm2ColHWTileConstraint(ConvGradXTileConstraintBase): + pass + +class PWConvGradXTileConstraint(ConvGradXTileConstraintBase): + pass + +class DWConvGradX2DTileConstraint(ConvGradXTileConstraintBase): + """ + Depthwise ConvGradX (dX) tiling, reusing ConvGradXTileConstraintBase. 
+ + Expected tensors: + data_in = grad_out (dY) [N, C, H_out, W_out] + data_out = grad_in (dX) [N, C, H_in, W_in] + weight = W [C, 1, P, Q] + """ + + # If your DW parser uses different keys, override here. + gradOutKey = "grad_out" + gradInKey = "grad_in" + weightKey = "weight" + + @classmethod + def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + dyName = parseDict[cls.gradOutKey] # dY + dxName = parseDict[cls.gradInKey] # dX + wName = parseDict[cls.weightKey] # W + + tilerModel.addTensorDimToModel(ctxt, dyName) + tilerModel.addTensorDimToModel(ctxt, dxName) + tilerModel.addTensorDimToModel(ctxt, wName) + + # N match + tilerModel.addConstraint(tilerModel.getTensorDimVar(dyName, 0) == tilerModel.getTensorDimVar(dxName, 0)) + + # DW channels: Cin == Cout == W[0], and W[1]==1 + tilerModel.addConstraint(tilerModel.getTensorDimVar(dyName, 1) == tilerModel.getTensorDimVar(wName, 0)) + tilerModel.addConstraint(tilerModel.getTensorDimVar(dxName, 1) == tilerModel.getTensorDimVar(wName, 0)) + tilerModel.addConstraint(tilerModel.getTensorDimVar(wName, 1) == 1) + + return tilerModel + + @classmethod + def addPolicyConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + """ + DW policy: + - keep full channels (C) + - weight not tiled + - enforce W[1]==1 + """ + dyName = parseDict[cls.gradOutKey] + dxName = parseDict[cls.gradInKey] + wName = parseDict[cls.weightKey] + + dyBuf = ctxt.lookup(dyName) + dxBuf = ctxt.lookup(dxName) + wBuf = ctxt.lookup(wName) + + # Full channels for both dY and dX + tilerModel.addConstraint(tilerModel.getTensorDimVar(dyName, 1) == dyBuf.shape[1]) + tilerModel.addConstraint(tilerModel.getTensorDimVar(dxName, 1) == dxBuf.shape[1]) + + # Weight not tiled + DW second dim fixed to 1 + tilerModel.addConstraint(tilerModel.getTensorDimVar(wName, 0) == wBuf.shape[0]) + tilerModel.addConstraint(tilerModel.getTensorDimVar(wName, 1) == 1) + 
tilerModel.addConstraint(tilerModel.getTensorDimVar(wName, 2) == wBuf.shape[2]) + tilerModel.addConstraint(tilerModel.getTensorDimVar(wName, 3) == wBuf.shape[3]) + + return tilerModel + + @classmethod + def extraSerializeChecks(cls, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> None: + varDY = operatorRepresentation[cls.gradOutKey] + varDX = operatorRepresentation[cls.gradInKey] + varW = operatorRepresentation[cls.weightKey] + + dyFull = tuple(ctxt.lookup(varDY).shape) # (N,C,Ho,Wo) + dxFull = tuple(ctxt.lookup(varDX).shape) # (N,C,Hi,Wi) + wShape = tuple(ctxt.lookup(varW).shape) # (C,1,P,Q) + + Cin = dxFull[1] + Cout = dyFull[1] + if Cin != Cout: + raise RuntimeError(f"DWConvGradX expects Cin==Cout, got Cin={Cin}, Cout={Cout}") + if wShape[0] != Cin: + raise RuntimeError(f"DWConvGradX expects W[0]==C, got W[0]={wShape[0]} vs C={Cin}") + if wShape[1] != 1: + raise RuntimeError(f"DWConvGradX expects W[1]==1, got {wShape[1]}") + + @classmethod + def get_dy_channels( + cls, + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation, + dyName: str, + dxName: str, + wName: str, + dyFull: Tuple[int, int, int, int], + dxFull: Tuple[int, int, int, int], + wShape: Tuple[int, int, int, int], + ) -> int: + # DW: dY channels is C + return dyFull[1] + +class ConvGradWTileConstraintBase(TileConstraint): + """ + Base for ConvGradW2D tiling (im2col-style): + - tile grad_out (dY) over H/W + - for each dY tile, derive the required input (X) tile (with kernel halo) + - grad_weight (dW) is NOT tiled (accumulation target is full tensor) + - unified template padding naming: + ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right} + where: + y => H dimension (vertical) => top/bottom + x => W dimension (horizontal) => left/right + """ + + # ---- parser/opRep keys (override if needed) ---- + dataInKey = "data_in" # X (forward input) + gradOutKey = "grad_out" # dY + weightKey = "grad_weight" # dW (output tensor) + + # 
--------------------------- + # 1) Geometrical constraints + # --------------------------- + @classmethod + def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + xName = parseDict[cls.dataInKey] + dyName = parseDict[cls.gradOutKey] + dwName = parseDict[cls.weightKey] + + tilerModel.addTensorDimToModel(ctxt, xName) + tilerModel.addTensorDimToModel(ctxt, dyName) + tilerModel.addTensorDimToModel(ctxt, dwName) + + group = parseDict.get("group", 1) + + # X, dY are NCHW + N_x = tilerModel.getTensorDimVar(xName, 0) + Ci_x = tilerModel.getTensorDimVar(xName, 1) + + N_dy = tilerModel.getTensorDimVar(dyName, 0) + Co_dy = tilerModel.getTensorDimVar(dyName, 1) + + # dW layout (standard): [C_out, C_in_per_group, P, Q] + Co_dw = tilerModel.getTensorDimVar(dwName, 0) + Ci_dw = tilerModel.getTensorDimVar(dwName, 1) + + # batch match + tilerModel.addConstraint(N_x == N_dy) + + # channel relations + tilerModel.addConstraint(Co_dy == Co_dw) + tilerModel.addConstraint(Ci_x == Ci_dw * group) + + return tilerModel + + # ----------------------- + # 2) Policy constraints + # ----------------------- + @classmethod + def addPolicyConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + """ + Default policy: + - keep full Cin/Cout on X and dY + - dW output is full (no tiling) because accumulation + - kernel dims fixed (no tiling) + - allow H/W tiling on dY (and derived halo on X) + """ + xName = parseDict[cls.dataInKey] + dyName = parseDict[cls.gradOutKey] + dwName = parseDict[cls.weightKey] + + xBuf = ctxt.lookup(xName) + dyBuf = ctxt.lookup(dyName) + dwBuf = ctxt.lookup(dwName) + + # Full channels for inputs + tilerModel.addConstraint(tilerModel.getTensorDimVar(xName, 1) == xBuf.shape[1]) # Cin + tilerModel.addConstraint(tilerModel.getTensorDimVar(dyName, 1) == dyBuf.shape[1]) # Cout + + # dW is full (all dims) + for d in range(len(dwBuf.shape)): + 
tilerModel.addConstraint(tilerModel.getTensorDimVar(dwName, d) == dwBuf.shape[d]) + + # dY tile spatial dims >= 1 + tilerModel.addConstraint(tilerModel.getTensorDimVar(dyName, 2) >= 1) + tilerModel.addConstraint(tilerModel.getTensorDimVar(dyName, 3) >= 1) + + return tilerModel + + # ----------------------------------- + # 3) Symbolic node representation + # ----------------------------------- + @classmethod + def constructSymbolicNodeRep( + cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext + ) -> Dict[str, Union[int, IntVar]]: + """ + Template bindings (matches your new template style for ConvGradW): + - dim_im_out_* / ch_im_out : for grad_out (dY) + - dim_im_in_* / ch_im_in : for input (X) + - dim_kernel_* : from dW tensor + - padding_* : unified naming + """ + xName = parseDict[cls.dataInKey] + dyName = parseDict[cls.gradOutKey] + dwName = parseDict[cls.weightKey] + + symbolic = parseDict.copy() + + # dY tile + symbolic["dim_im_out_x"] = tilerModel.getTensorDimVar(dyName, 2) # H_out tile + symbolic["dim_im_out_y"] = tilerModel.getTensorDimVar(dyName, 3) # W_out tile + symbolic["ch_im_out"] = tilerModel.getTensorDimVar(dyName, 1) # C_out + + # X tile + symbolic["dim_im_in_x"] = tilerModel.getTensorDimVar(xName, 2) # H_in tile + symbolic["dim_im_in_y"] = tilerModel.getTensorDimVar(xName, 3) # W_in tile + symbolic["ch_im_in"] = tilerModel.getTensorDimVar(xName, 1) # C_in + + # Kernel dims from dW: [C_out, C_in_per_group, P, Q] + symbolic["dim_kernel_x"] = tilerModel.getTensorDimVar(dwName, 2) # P (H) + symbolic["dim_kernel_y"] = tilerModel.getTensorDimVar(dwName, 3) # Q (W) + + return symbolic + + # ------------------------------- + # helpers + # ------------------------------- + @staticmethod + def _ceil_div(a: int, b: int) -> int: + return -((-a) // b) + + @staticmethod + def _floor_div(a: int, b: int) -> int: + return a // b + + @classmethod + def computeInputTileFromGradOutTile( + cls, + kernel_hw: Tuple[int, int], # (P, Q) + pads: Tuple[int, 
int, int, int], # (t, b, l, r) + strides: Tuple[int, int], # (sh, sw) + inputCSize: int, # Cin (full) + gradOutTile: HyperRectangle, # dY tile (N, Cout, Ho_t, Wo_t) + inputFull: Tuple[int, int, int, int], # X full (N, Cin, Hi, Wi) + gradOutFull: Tuple[int, int, int, int], # dY full (N, Cout, Ho, Wo) + ) -> Tuple[HyperRectangle, Tuple[int, int, int, int]]: + """ + Given dY tile offsets, compute required X tile: + h_in in [h_out*sh - pad_top, h_out*sh - pad_top + P) + w_in in [w_out*sw - pad_left, w_out*sw - pad_left + Q) + """ + (nOff, _cOff, hoOff, woOff) = gradOutTile.offset + (nSize, _cSize, hoSize, woSize) = gradOutTile.dims + + pad_top, pad_bottom, pad_left, pad_right = pads + sh, sw = strides + P, Q = kernel_hw + + h_in_start = hoOff * sh - pad_top + w_in_start = woOff * sw - pad_left + + h_in_end = (hoOff + hoSize - 1) * sh - pad_top + P + w_in_end = (woOff + woSize - 1) * sw - pad_left + Q + + # clamp to X valid range + h_in_start_c = max(0, h_in_start) + w_in_start_c = max(0, w_in_start) + h_in_end_c = min(inputFull[2], h_in_end) + w_in_end_c = min(inputFull[3], w_in_end) + + hiSize = max(1, h_in_end_c - h_in_start_c) + wiSize = max(1, w_in_end_c - w_in_start_c) + + xTile = HyperRectangle( + (nOff, 0, h_in_start_c, w_in_start_c), + (nSize, inputCSize, hiSize, wiSize), + ) + + # ONNX pads apply only on boundary tiles of dY space + Hy = gradOutFull[2] + Wy = gradOutFull[3] + + tile_pad_top = pad_top if hoOff == 0 else 0 + tile_pad_bottom = pad_bottom if (hoOff + hoSize) == Hy else 0 + tile_pad_left = pad_left if woOff == 0 else 0 + tile_pad_right = pad_right if (woOff + woSize) == Wy else 0 + + return xTile, (tile_pad_top, tile_pad_bottom, tile_pad_left, tile_pad_right) + + @classmethod + def extraSerializeChecks(cls, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> None: + """Hook for DW checks etc.""" + return + + # --------------------------------------------------- + # 4) serialize: dY tiles -> X halo tiles, dW full + # 
--------------------------------------------------- + @classmethod + def serializeTilingSolution( + cls, + tilingSolution: NodeMemoryConstraint, + absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation, + ) -> Tuple[VariableReplacementScheme, TilingSchedule]: + cls.extraSerializeChecks(ctxt, operatorRepresentation) + + xName = operatorRepresentation[cls.dataInKey] + dyName = operatorRepresentation[cls.gradOutKey] + dwName = operatorRepresentation[cls.weightKey] + + _pads = list(operatorRepresentation.get("pads", [0, 0, 0, 0])) # ONNX: [H_begin, W_begin, H_end, W_end] + pads = (_pads[0], _pads[2], _pads[1], _pads[3]) # reorder to (top, bottom, left, right) + strides = tuple(operatorRepresentation.get("strides", [1, 1])) # (sh,sw) + + xFull = tuple(ctxt.lookup(xName).shape) # (N,Cin,Hi,Wi) + dyFull = tuple(ctxt.lookup(dyName).shape) # (N,Cout,Ho,Wo) + dwShape = tuple(ctxt.lookup(dwName).shape) # standard: (Cout,Cin_per_group,P,Q) + + # Use the tiler-computed dY tile shape at this mem level + # (if missing, fall back to full dy) + try: + dyTileShape = tilingSolution.tensorMemoryConstraints[dyName].memoryConstraints[targetMemLevel].shape + except Exception: + dyTileShape = dyFull + + N_tile = dyTileShape[0] + Ho_tile_max = dyTileShape[2] + Wo_tile_max = dyTileShape[3] + + # Generate (ho,wo) tiles covering full dY spatial dims + Ho_full = dyFull[2] + Wo_full = dyFull[3] + + h_tiles: List[Tuple[int, int]] = [] + w_tiles: List[Tuple[int, int]] = [] + + ho = 0 + while ho < Ho_full: + hs = min(Ho_tile_max, Ho_full - ho) + h_tiles.append((ho, hs)) + ho += hs + + wo = 0 + while wo < Wo_full: + ws = min(Wo_tile_max, Wo_full - wo) + w_tiles.append((wo, ws)) + wo += ws + + # Base addrs: inputs are X + dY, output is dW + addrNames = [cls.dataInKey, cls.gradOutKey, cls.weightKey] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr( + tilingSolution, targetMemLevel, 
operatorRepresentation, addrNames + ) + + # Unified template naming + replacements: Dict[str, List[int]] = { + "dim_im_in_x": [], + "dim_im_in_y": [], + "dim_im_out_x": [], + "dim_im_out_y": [], + "ch_im_in": [], + "ch_im_out": [], + "padding_y_top": [], + "padding_y_bottom": [], + "padding_x_left": [], + "padding_x_right": [], + } + + replacementTypes = { + "dim_im_in_x": PointerClass(uint16_t), + "dim_im_in_y": PointerClass(uint16_t), + "dim_im_out_x": PointerClass(uint16_t), + "dim_im_out_y": PointerClass(uint16_t), + "ch_im_in": PointerClass(uint16_t), + "ch_im_out": PointerClass(uint16_t), + "padding_y_top": PointerClass(uint8_t), + "padding_y_bottom": PointerClass(uint8_t), + "padding_x_left": PointerClass(uint8_t), + "padding_x_right": PointerClass(uint8_t), + } + + Cin_full = xFull[1] + Cout_full = dyFull[1] + + # dW is full cube (accumulation target) + fullDW = HyperRectangle((0, 0, 0, 0), dwShape) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + # Build tiles + for hoOff, hoSz in h_tiles: + for woOff, woSz in w_tiles: + dyTile = HyperRectangle( + (0, 0, hoOff, woOff), + (N_tile, Cout_full, hoSz, woSz), + ) + + xTile, (tpt, tpb, tpl, tpr) = cls.computeInputTileFromGradOutTile( + kernel_hw=(dwShape[2], dwShape[3]), + pads=pads, + strides=strides, + inputCSize=Cin_full, + gradOutTile=dyTile, + inputFull=xFull, + gradOutFull=dyFull, + ) + + # dims (x=H, y=W) + replacements["dim_im_in_x"].append(xTile.dims[2]) + replacements["dim_im_in_y"].append(xTile.dims[3]) + replacements["dim_im_out_x"].append(dyTile.dims[2]) + replacements["dim_im_out_y"].append(dyTile.dims[3]) + + replacements["ch_im_in"].append(Cin_full) + replacements["ch_im_out"].append(Cout_full) + + # ONNX pads (t,b,l,r) -> unified naming: + # padding_y_top/bottom : H dimension => top/bottom + # padding_x_left/right : W dimension => left/right + replacements["padding_y_top"].append(tpt) # H_begin = top + replacements["padding_y_bottom"].append(tpb) # H_end = bottom + 
replacements["padding_x_left"].append(tpl) # W_begin = left + replacements["padding_x_right"].append(tpr) # W_end = right + + inputLoadSchedule.append({cls.dataInKey: xTile, cls.gradOutKey: dyTile}) + outputLoadSchedule.append({cls.weightKey: fullDW}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + return variableReplacementSchedule, tilingSchedule + + +class ConvGradW2DTileConstraint(ConvGradWTileConstraintBase): + """Standard ConvGradW2D (non-depthwise).""" + pass + +class PWConvGradWTileConstraint(ConvGradWTileConstraintBase): + """Pointwise ConvGradW (1x1 kernel).""" + pass + +class ConvGradBTileConstraint(TileConstraint): + """ + TileConstraint for ConvGradB: dB[c] = sum_{n,h,w} dY[n,c,h,w] + + Tiles along C (output channels). N, H, W are kept full (reduction dims). + Input: grad_out (dY) [N, C, H, W] — load C-slice per tile + Output: grad_bias (dB) [C] — write C-slice per tile + """ + + @classmethod + def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + dyName = parseDict['grad_out'] + dbName = parseDict['grad_bias'] + + tilerModel.addTensorDimToModel(ctxt, dyName) + tilerModel.addTensorDimToModel(ctxt, dbName) + + dyBuf = ctxt.lookup(dyName) + N, C, H, W = dyBuf.shape[0], dyBuf.shape[1], dyBuf.shape[2], dyBuf.shape[3] + + # C must match between dY and dB + tilerModel.addConstraint(tilerModel.getTensorDimVar(dyName, 1) == tilerModel.getTensorDimVar(dbName, 0)) + + # Keep N, H, W full (reduction dims — cannot split without atomics) + tilerModel.addConstraint(tilerModel.getTensorDimVar(dyName, 0) == N) + tilerModel.addConstraint(tilerModel.getTensorDimVar(dyName, 2) == H) + tilerModel.addConstraint(tilerModel.getTensorDimVar(dyName, 3) == W) + + return tilerModel + + @classmethod + def constructSymbolicNodeRep(cls, tilerModel: TilerModel, 
parseDict: Dict, + ctxt: NetworkContext) -> Dict: + dyName = parseDict['grad_out'] + dyBuf = ctxt.lookup(dyName) + N, H, W = dyBuf.shape[0], dyBuf.shape[2], dyBuf.shape[3] + + symbolic = parseDict.copy() + symbolic['ch_im_out'] = tilerModel.getTensorDimVar(dyName, 1) + symbolic['batch'] = N + symbolic['dim_im_out_x'] = H + symbolic['dim_im_out_y'] = W + return symbolic + + @classmethod + def serializeTilingSolution( + cls, + tilingSolution: NodeMemoryConstraint, + absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation, + ) -> Tuple[VariableReplacementScheme, TilingSchedule]: + + dyName = operatorRepresentation['grad_out'] + dyBuf = ctxt.lookup(dyName) + N, H, W = dyBuf.shape[0], dyBuf.shape[2], dyBuf.shape[3] + + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr( + tilingSolution, targetMemLevel, operatorRepresentation, ['grad_out', 'grad_bias'] + ) + + replacements: Dict[str, List[int]] = {'ch_im_out': []} + replacementTypes = {'ch_im_out': PointerClass(uint16_t)} + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for absOut in absoluteOutputCubes: + dbTile = absOut.rectangle # 1D: offset=(c_off,), dims=(c_size,) + c_off = dbTile.offset[0] + c_size = dbTile.dims[0] + + dyTile = HyperRectangle((0, c_off, 0, 0), (N, c_size, H, W)) + + replacements['ch_im_out'].append(c_size) + inputLoadSchedule.append({'grad_out': dyTile}) + outputLoadSchedule.append({'grad_bias': dbTile}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + return variableReplacementSchedule, tilingSchedule + + +class DWConvGradW2DTileConstraint(ConvGradWTileConstraintBase): + """ + Depthwise ConvGradW: + - X: [N, C, Hi, Wi] + - dY: [N, C, Ho, Wo] (Cout == Cin == C) + - dW: [C, 1, P, Q] + """ + + @classmethod + def addGeometricalConstraint(cls, 
tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + xName = parseDict[cls.dataInKey] + dyName = parseDict[cls.gradOutKey] + dwName = parseDict[cls.weightKey] + + tilerModel.addTensorDimToModel(ctxt, xName) + tilerModel.addTensorDimToModel(ctxt, dyName) + tilerModel.addTensorDimToModel(ctxt, dwName) + + # N match + tilerModel.addConstraint(tilerModel.getTensorDimVar(xName, 0) == tilerModel.getTensorDimVar(dyName, 0)) + + # DW dW layout: [C, 1, P, Q] + C_dw = tilerModel.getTensorDimVar(dwName, 0) + Cpg_dw = tilerModel.getTensorDimVar(dwName, 1) + + # X and dY channels are both C + tilerModel.addConstraint(tilerModel.getTensorDimVar(xName, 1) == C_dw) + tilerModel.addConstraint(tilerModel.getTensorDimVar(dyName, 1) == C_dw) + + # Cin_per_group must be 1 + tilerModel.addConstraint(Cpg_dw == 1) + + return tilerModel + + @classmethod + def addPolicyConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # Reuse base policy but also enforce DW-specific invariants tightly + xName = parseDict[cls.dataInKey] + dyName = parseDict[cls.gradOutKey] + dwName = parseDict[cls.weightKey] + + xBuf = ctxt.lookup(xName) + dyBuf = ctxt.lookup(dyName) + dwBuf = ctxt.lookup(dwName) + + # full channels + tilerModel.addConstraint(tilerModel.getTensorDimVar(xName, 1) == xBuf.shape[1]) + tilerModel.addConstraint(tilerModel.getTensorDimVar(dyName, 1) == dyBuf.shape[1]) + + # DW invariants: Cin == Cout == dwBuf.shape[0], dwBuf.shape[1] == 1 + tilerModel.addConstraint(tilerModel.getTensorDimVar(dwName, 0) == xBuf.shape[1]) + tilerModel.addConstraint(tilerModel.getTensorDimVar(dwName, 0) == dyBuf.shape[1]) + tilerModel.addConstraint(tilerModel.getTensorDimVar(dwName, 1) == 1) + + # dW full (no tiling) + for d in range(len(dwBuf.shape)): + tilerModel.addConstraint(tilerModel.getTensorDimVar(dwName, d) == dwBuf.shape[d]) + + # dY tile spatial dims >= 1 + tilerModel.addConstraint(tilerModel.getTensorDimVar(dyName, 2) >= 1) + 
tilerModel.addConstraint(tilerModel.getTensorDimVar(dyName, 3) >= 1) + + return tilerModel + + @classmethod + def extraSerializeChecks(cls, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> None: + xName = operatorRepresentation[cls.dataInKey] + dyName = operatorRepresentation[cls.gradOutKey] + dwName = operatorRepresentation[cls.weightKey] + + xFull = tuple(ctxt.lookup(xName).shape) + dyFull = tuple(ctxt.lookup(dyName).shape) + dwShape = tuple(ctxt.lookup(dwName).shape) + + Cin = xFull[1] + Cout = dyFull[1] + assert Cin == Cout, f"DWConvGradW expects Cin==Cout, got Cin={Cin}, Cout={Cout}" + assert dwShape[0] == Cin, f"DWConvGradW expects dW[0]==C, got dW[0]={dwShape[0]} vs C={Cin}" + assert dwShape[1] == 1, f"DWConvGradW expects dW[1]==1, got dW[1]={dwShape[1]}" + return diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/GlobalAveragePoolTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/GlobalAveragePoolTileConstraint.py new file mode 100644 index 0000000000..c8416f47e2 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/GlobalAveragePoolTileConstraint.py @@ -0,0 +1,173 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class GlobalAveragePoolTileConstraint(TileConstraint): + """Tile constraint for GlobalAveragePool (NCHW). 
+ + Input: data_in [N, C, H, W] + Output: data_out [N, C, 1, 1] (N*C elements) + + Tiling strategy: tile along C (channels are independent). + - N, H, W are pinned to full: GAP needs all spatial elements per channel. + - C is free: each channel's mean is computed independently. + """ + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + data_in_name = parseDict['data_in'] + data_out_name = parseDict['data_out'] + + for name in [data_in_name, data_out_name]: + tilerModel.addTensorDimToModel(ctxt, name) + + input_shape = ctxt.lookup(data_in_name).shape + N = input_shape[0] + H = input_shape[2] + W = input_shape[3] + + # Pin N, H, W to full + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName=data_in_name, dimIdx=0) == N) + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName=data_in_name, dimIdx=2) == H) + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName=data_in_name, dimIdx=3) == W) + + # Output shape [N, C, 1, 1]: dim 0 = N (pinned), dim 1 = C (free) + # data_out is stored as [N, C] effectively; tilerModel sees it as 4D [N,C,1,1] + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=data_out_name, dimIdx=0) == + tilerModel.getTensorDimVar(tensorName=data_in_name, dimIdx=0)) + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=data_out_name, dimIdx=1) == + tilerModel.getTensorDimVar(tensorName=data_in_name, dimIdx=1)) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + output_cubes = [cube.rectangle for cube in absoluteOutputCubes] + + addr_names = ['data_in', 'data_out'] + input_base_offsets, output_base_offsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + 
operatorRepresentation, addr_names) + + replacements = {"channels": []} + replacement_types = {"channels": PointerClass(uint16_t)} + + input_load_schedule = [] + output_load_schedule = [] + + input_shape = ctxt.lookup(operatorRepresentation['data_in']).shape + H = input_shape[2] + W = input_shape[3] + + for cube in output_cubes: + # cube is the tile of data_out: [N, C_tile, 1, 1] + C_tile = cube.dims[1] + c_start = cube.offset[1] + + replacements["channels"].append(C_tile) + + # Input tile: [N, C_tile, H, W] + in_cube = HyperRectangle( + (cube.offset[0], c_start, 0, 0), + (cube.dims[0], C_tile, H, W) + ) + input_load_schedule.append({"data_in": in_cube}) + output_load_schedule.append({"data_out": cube}) + + tiling_schedule = TilingSchedule(input_base_offsets, output_base_offsets, input_load_schedule, + output_load_schedule) + variable_replacement_schedule = VariableReplacementScheme(replacements, replacement_types) + return variable_replacement_schedule, tiling_schedule + + +class GlobalAveragePoolGradTileConstraint(TileConstraint): + """Tile constraint for GlobalAveragePoolGrad (NCHW). + + Input: dY [N, C, 1, 1] (N*C elements) + Output: dX [N, C, H, W] + + Tiling strategy: tile along C (channels are independent). + - N, H, W are pinned to full. + - C is free. 
+ """ + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + dY_name = parseDict['dY'] + dX_name = parseDict['dX'] + + for name in [dY_name, dX_name]: + tilerModel.addTensorDimToModel(ctxt, name) + + output_shape = ctxt.lookup(dX_name).shape + N = output_shape[0] + H = output_shape[2] + W = output_shape[3] + + # Pin N, H, W to full + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName=dX_name, dimIdx=0) == N) + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName=dX_name, dimIdx=2) == H) + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName=dX_name, dimIdx=3) == W) + + # dY [N, C, 1, 1]: N pinned, C follows dX C + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=dY_name, dimIdx=0) == + tilerModel.getTensorDimVar(tensorName=dX_name, dimIdx=0)) + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=dY_name, dimIdx=1) == + tilerModel.getTensorDimVar(tensorName=dX_name, dimIdx=1)) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + output_cubes = [cube.rectangle for cube in absoluteOutputCubes] + + addr_names = ['dY', 'dX'] + input_base_offsets, output_base_offsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addr_names) + + replacements = {"channels": []} + replacement_types = {"channels": PointerClass(uint16_t)} + + input_load_schedule = [] + output_load_schedule = [] + + for cube in output_cubes: + # cube is the tile of dX: [N, C_tile, H, W] + C_tile = cube.dims[1] + c_start = cube.offset[1] + + replacements["channels"].append(C_tile) + + # dY tile: [N, C_tile, 1, 1] + dy_cube = HyperRectangle( + (cube.offset[0], c_start, 0, 0), + 
(cube.dims[0], C_tile, 1, 1) + ) + input_load_schedule.append({"dY": dy_cube}) + output_load_schedule.append({"dX": cube}) + + tiling_schedule = TilingSchedule(input_base_offsets, output_base_offsets, input_load_schedule, + output_load_schedule) + variable_replacement_schedule = VariableReplacementScheme(replacements, replacement_types) + return variable_replacement_schedule, tiling_schedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/InPlaceAccumulatorV2TileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/InPlaceAccumulatorV2TileConstraint.py new file mode 100644 index 0000000000..2d3cfa4c3e --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/InPlaceAccumulatorV2TileConstraint.py @@ -0,0 +1,102 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.Targets.Generic.TileConstraints.BOPTileConstraint import BOPTileConstraint +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class InPlaceAccumulatorV2TileConstraint(BOPTileConstraint): + """Tile constraint for InPlaceAccumulatorV2. + + Tiles buffer and gradient together (same shape); lazy_reset_grad is a + scalar (1 element) and is not tiled. 
+ """ + + dataIn1Name = 'accum_buffer' + dataIn2Name = 'gradient' + dataOutName = 'data_out' + + @classmethod + def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # Register buffer, gradient, data_out and add BOP equality constraints + tilerModel = super().addGeometricalConstraint(tilerModel, parseDict, ctxt) + + # Register lazy_reset_grad (scalar flag, not tiled): fix all dims to full size + lazyResetName = parseDict['lazy_reset_grad'] + tilerModel.addTensorDimToModel(ctxt, lazyResetName) + lazyResetTensor = ctxt.lookup(lazyResetName) + shape = lazyResetTensor.shape + dims = [shape] if isinstance(shape, int) else shape + for idx, dim in enumerate(dims): + dimVar = tilerModel.getTensorDimVar(lazyResetName, idx) + tilerModel.addConstraint(dimVar == dim) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + # Egress strategy: use data_out (the proper graph output, present in + # outputTensorMemoryConstraints) rather than accum_buffer (a graph input, + # only in inputTensorMemoryConstraints). This avoids two core-class issues: + # 1. accum_buffer appearing in BOTH inputBaseOffsets and outputBaseOffsets + # causes a duplicate-hoist KeyError in TilingVariableReplacement. + # 2. The egress DMA lookup uses outputTensorMemoryConstraints; accum_buffer + # is not there and would raise a KeyError. + # + # The trick: force outputBaseOffsets[data_out] to the SAME L1 arena offset as + # inputBaseOffsets[accum_buffer]. Both data_out_ref and accum_buffer_ref then + # map to the same physical L1 address. 
The tiled kernel writes to ${accum_buffer} + # (= accum_buffer_ref in L1); the egress DMA transfers data_out_ref (same L1 + # bytes) to data_out's L2 address, which is what the optimizer reads. + addrNames = [cls.dataIn1Name, cls.dataIn2Name, cls.dataOutName, 'lazy_reset_grad'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + # Pin data_out's L1 tile to the same arena slot as accum_buffer's L1 tile. + outputBaseOffsets[cls.dataOutName] = inputBaseOffsets[cls.dataIn1Name] + + replacements = {"size": []} + replacementTypes = {"size": PointerClass(uint16_t)} + + lazyResetName = operatorRepresentation['lazy_reset_grad'] + lazyResetShape = ctxt.lookup(lazyResetName).shape + lazyResetDims = (lazyResetShape,) if isinstance(lazyResetShape, int) else tuple(lazyResetShape) + lazyResetCube = HyperRectangle((0,) * len(lazyResetDims), lazyResetDims) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for cube in outputCubes: + replacements["size"].append(int(np.prod(cube.dims))) + inputLoadSchedule.append({ + cls.dataIn1Name: cube, + cls.dataIn2Name: cube, + 'lazy_reset_grad': lazyResetCube, + }) + + for out in outputCubes: + # Egress: DMA from data_out_ref (same L1 slot as accum_buffer_ref) → data_out L2. 
+ outputLoadSchedule.append({ + cls.dataOutName: out, + }) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/LayernormTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/LayernormTileConstraint.py index c3593ee6f0..4eb67020e6 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/LayernormTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/LayernormTileConstraint.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +import copy from typing import Dict, List, Tuple import numpy as np @@ -46,8 +47,56 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = idx) == tilerModel.getTensorDimVar( tensorName = outputBufferName, dimIdx = idx)) + # Register mean/inv_std_dev (secondary outputs, shape = inputShape[:-1]) + # They tile along all dims except features, so constrain them to match data_in. + for secondary in ['mean', 'inv_std_dev']: + secondary_name = parseDict.get(secondary, '') + if secondary_name: + tilerModel.addTensorDimToModel(ctxt, secondary_name) + for idx in range(len(inputShape) - 1): + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = secondary_name, dimIdx = idx) == + tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = idx)) + return tilerModel + @classmethod + def wrapTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, List[TilingSchedule]]: + + dataOutVar = operatorRepresentation['data_out'] + + # Build a single-output copy to bypass the base-class assertion + # that len(outputTensorMemoryConstraints) == 1. 
+ singleOutputSolution = copy.deepcopy(tilingSolution) + singleOutputSolution.outputTensorMemoryConstraints = { + dataOutVar: tilingSolution.outputTensorMemoryConstraints[dataOutVar] + } + + varReplacement, tilingSchedules = super().wrapTilingSolution(singleOutputSolution, targetMemLevel, ctxt, + operatorRepresentation) + + # Extend each tiling schedule to include mean and inv_std_dev outputs. + # Their tile rectangles are derived from data_out by dropping the features dim. + for secondary in ['mean', 'inv_std_dev']: + secondaryVar = operatorRepresentation.get(secondary, '') + if not secondaryVar: + continue + if secondaryVar not in tilingSolution.outputTensorMemoryConstraints: + continue + addr = TileConstraint.getBaseAddr(tilingSolution, targetMemLevel, secondaryVar) + if addr == [None]: + continue + for schedule in tilingSchedules: + schedule.outputBaseOffsets[secondary] = addr + for step in schedule.outputLoadSchedule: + data_out_rect = step['data_out'] + # mean/inv_std_dev: drop the last (features) dim from data_out tile + step[secondary] = HyperRectangle(data_out_rect.offset[:-1], data_out_rect.dims[:-1]) + + return varReplacement, tilingSchedules + @classmethod def serializeTilingSolution( cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], @@ -87,12 +136,9 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw grad_in_buffer_name = parseDict['grad_in'] data_in_buffer_name = parseDict['data_in'] weight_buffer_name = parseDict['weight'] - bias_buffer_name = parseDict['bias'] grad_out_buffer_name = parseDict['grad_out'] - for buffer_name in [ - grad_in_buffer_name, data_in_buffer_name, weight_buffer_name, bias_buffer_name, grad_out_buffer_name - ]: + for buffer_name in [grad_in_buffer_name, data_in_buffer_name, weight_buffer_name, grad_out_buffer_name]: tilerModel.addTensorDimToModel(ctxt, buffer_name) input_shape = ctxt.lookup(data_in_buffer_name).shape @@ -106,10 +152,6 @@ def 
addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw tilerModel.getTensorDimVar(tensorName = data_in_buffer_name, dimIdx = last_dim_idx) == tilerModel.getTensorDimVar(tensorName = weight_buffer_name, dimIdx = 0)) - tilerModel.addConstraint( - tilerModel.getTensorDimVar(tensorName = data_in_buffer_name, dimIdx = last_dim_idx) == - tilerModel.getTensorDimVar(tensorName = bias_buffer_name, dimIdx = 0)) - for idx, dim in enumerate(input_shape): tilerModel.addConstraint( tilerModel.getTensorDimVar(tensorName = data_in_buffer_name, dimIdx = idx) == @@ -120,8 +162,73 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw tilerModel.getTensorDimVar(tensorName = data_in_buffer_name, dimIdx = idx) == tilerModel.getTensorDimVar(tensorName = grad_out_buffer_name, dimIdx = idx)) + # Register mean/inv_std_dev inputs (shape = inputShape[:-1]). + for secondary in ['mean', 'inv_std_dev']: + secondary_name = parseDict.get(secondary, '') + if secondary_name: + tilerModel.addTensorDimToModel(ctxt, secondary_name) + for idx in range(len(input_shape) - 1): + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = secondary_name, dimIdx = idx) == + tilerModel.getTensorDimVar(tensorName = data_in_buffer_name, dimIdx = idx)) + + # Register weight_grad/bias_grad (secondary outputs, shape = [features]). + # Their single dimension (features) is already pinned to full size via last_dim_len above. + for secondary in ['weight_grad', 'bias_grad']: + secondary_name = parseDict.get(secondary, '') + if secondary_name: + tilerModel.addTensorDimToModel(ctxt, secondary_name) + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = secondary_name, dimIdx = 0) == last_dim_len) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # Only pin the feature (last) dim — already done in addGeometricalConstraint. 
+ # Seq dims are left free so the solver can tile along the sequence dimension. + # weight_grad/bias_grad accumulation across seq tiles is handled in the template + # via a static-flag memset + inline accumulation loop (ConvGradW pattern). return tilerModel + @classmethod + def wrapTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, List[TilingSchedule]]: + + gradOutVar = operatorRepresentation['grad_out'] + + # Build a single-output copy to bypass the base-class assertion + # that len(outputTensorMemoryConstraints) == 1. + singleOutputSolution = copy.deepcopy(tilingSolution) + singleOutputSolution.outputTensorMemoryConstraints = { + gradOutVar: tilingSolution.outputTensorMemoryConstraints[gradOutVar] + } + + varReplacement, tilingSchedules = super().wrapTilingSolution(singleOutputSolution, targetMemLevel, ctxt, + operatorRepresentation) + + # Extend each tiling schedule to include weight_grad and bias_grad outputs. + # Since batch is pinned to full size (addPolicyConstraint), there is effectively + # one tile step and these are always full-size tensors. 
+ for secondary in ['weight_grad', 'bias_grad']: + secondaryVar = operatorRepresentation.get(secondary, '') + if not secondaryVar: + continue + if secondaryVar not in tilingSolution.outputTensorMemoryConstraints: + continue + addr = TileConstraint.getBaseAddr(tilingSolution, targetMemLevel, secondaryVar) + if addr == [None]: + continue + buf = ctxt.lookup(secondaryVar) + full_rect = HyperRectangle((0,) * len(buf.shape), tuple(buf.shape)) + for schedule in tilingSchedules: + schedule.outputBaseOffsets[secondary] = addr + for step in schedule.outputLoadSchedule: + step[secondary] = full_rect + + return varReplacement, tilingSchedules + @classmethod def serializeTilingSolution( cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], @@ -129,7 +236,7 @@ def serializeTilingSolution( operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: output_cubes = [cube.rectangle for cube in absoluteOutputCubes] - addr_names = ['grad_in', 'data_in', 'weight', 'bias', 'grad_out'] + addr_names = ['grad_in', 'data_in', 'weight', 'mean', 'inv_std_dev', 'grad_out'] input_base_offsets, output_base_offsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, addr_names) @@ -144,11 +251,20 @@ def serializeTilingSolution( replacements["size"].append(new_size) feature_size = cube.dims[-1] + seq_dims = cube.dims[:-1] + seq_offset = cube.offset[:-1] if len(cube.offset) > 1 else (0,) weight_cube = HyperRectangle((0,), (feature_size,)) - bias_cube = HyperRectangle((0,), (feature_size,)) - - input_load_schedule.append({"grad_in": cube, "data_in": cube, "weight": weight_cube, "bias": bias_cube}) + mean_cube = HyperRectangle(seq_offset, seq_dims) + inv_std_dev_cube = HyperRectangle(seq_offset, seq_dims) + + input_load_schedule.append({ + "grad_in": cube, + "data_in": cube, + "weight": weight_cube, + "mean": mean_cube, + "inv_std_dev": inv_std_dev_cube, + }) 
output_load_schedule.append({"grad_out": cube}) diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/MSELossTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/MSELossTileConstraint.py new file mode 100644 index 0000000000..456893f7d7 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/MSELossTileConstraint.py @@ -0,0 +1,158 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme +from Deeploy.Targets.Generic.TileConstraints.BOPTileConstraint import BOPTileConstraint + + +class MSELossTileConstraint(TileConstraint): + """TileConstraint for MSELoss(pred, target) -> scalar loss. + + MSELoss = mean((pred - target)^2) is a global reduction; it cannot be + meaningfully tiled because the normaliser (N) changes with tile size. + All input dimensions are pinned to their full size. + + The output is a 0-d scalar represented as a 1-element DMA transfer. + wrapTilingSolution is overridden to bypass the base-class cube logic which + cannot handle 0-d shape tensors. + """ + + dataIn1Name = 'pred' + dataIn2Name = 'target' + dataOutName = 'loss' + + @classmethod + def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> TilerModel: + predName = parseDict[cls.dataIn1Name] + targetName = parseDict[cls.dataIn2Name] + # Don't add the scalar loss to the tilerModel — it has 0 dims. 
+ for bufferName in [predName, targetName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + predShape = ctxt.lookup(predName).shape + for dim in range(len(predShape)): + predDim = tilerModel.getTensorDimVar(predName, dim) + targetDim = tilerModel.getTensorDimVar(targetName, dim) + tilerModel.addConstraint(predDim == targetDim) + return tilerModel + + @classmethod + def addPolicyConstraint(cls, tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> TilerModel: + # Pin every dimension to its full size: MSELoss is a global reduction. + predName = parseDict[cls.dataIn1Name] + predBuffer = ctxt.lookup(predName) + for dimIdx, dimLen in enumerate(predBuffer.shape): + dimVar = tilerModel.getTensorDimVar(predName, dimIdx) + tilerModel.addConstraint(dimVar == dimLen) + return tilerModel + + @classmethod + def constructSymbolicNodeRep(cls, tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict: + # num_elements is constant (all dims pinned to full size). + return parseDict.copy() + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, + absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + + addrNames = [cls.dataIn1Name, cls.dataIn2Name] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr( + tilingSolution, targetMemLevel, operatorRepresentation, addrNames) + + # Add scalar loss output address if available at this memory level. + lossName = operatorRepresentation[cls.dataOutName] + lossAddr = cls.getBaseAddr(tilingSolution, targetMemLevel, lossName) + if lossAddr != [None]: + outputBaseOffsets[cls.dataOutName] = lossAddr + + # Load the full pred / target tensors in one DMA. 
+ predName = operatorRepresentation[cls.dataIn1Name] + predBuffer = ctxt.lookup(predName) + fullCube = HyperRectangle((0,) * len(predBuffer.shape), tuple(predBuffer.shape)) + + num_elements = operatorRepresentation['num_elements'] + replacements = {'num_elements': [num_elements]} + replacementTypes = {'num_elements': PointerClass(uint16_t)} + + inputLoadSchedule = [{cls.dataIn1Name: fullCube, cls.dataIn2Name: fullCube}] + lossRect = HyperRectangle((0,), (1,)) + outputLoadSchedule = [{cls.dataOutName: lossRect}] + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, + inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + return variableReplacementSchedule, tilingSchedule + + @classmethod + def wrapTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, targetMemLevel: str, + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, List[TilingSchedule]]: + # The single output is a 0-d scalar. The base-class wrapTilingSolution + # would crash building HyperRectangle(shape=[]). We bypass it and call + # serializeTilingSolution directly with a dummy 1-element scalar rect. + scalarRect = AbsoluteHyperRectangle(HyperRectangle((0,), (1,)), (0,)) + varReplacement, tilingSchedule = cls.serializeTilingSolution( + tilingSolution, [scalarRect], targetMemLevel, ctxt, operatorRepresentation) + tilingSchedule = cls.sanitizeTilingSchedule(tilingSchedule) + return varReplacement, [tilingSchedule] + + +class MSELossGradTileConstraint(BOPTileConstraint): + """TileConstraint for MSELossGrad(pred, target) -> grad. + + The gradient grad[i] = 2*(pred[i]-target[i])/N is element-wise — the same + tiling works as any binary element-wise op (BOPTileConstraint). + We only swap the replacement key from 'size' to 'num_elements' to match + the variable name used in the MSELossGrad C template. 
+ """ + + dataIn1Name = 'pred' + dataIn2Name = 'target' + dataOutName = 'grad' + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, + absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + import numpy as np + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = [cls.dataIn1Name, cls.dataIn2Name, cls.dataOutName] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr( + tilingSolution, targetMemLevel, operatorRepresentation, addrNames) + + replacements = {'num_elements': []} + replacementTypes = {'num_elements': PointerClass(uint16_t)} + + for cube in outputCubes: + replacements['num_elements'].append(int(np.prod(cube.dims))) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for cube in outputCubes: + inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: cube}) + outputLoadSchedule.append({cls.dataOutName: cube}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, + inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/MaxPoolGradTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/MaxPoolGradTileConstraint.py new file mode 100644 index 0000000000..9a7d38c272 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/MaxPoolGradTileConstraint.py @@ -0,0 +1,129 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from 
Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class MaxPoolGradCTileConstraint(TileConstraint): + """Channel-tiling constraint for MaxPoolGrad. + + Tiles the channel dimension (last dim in HWC format) across all three tensors: + - data_in (grad_output): [N, Ho, Wo, C] + - x_in (original_input):[N, Hi, Wi, C] + - data_out (grad_input): [N, Hi, Wi, C] + All spatial dimensions are kept at their full size. + """ + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + gradOutName = parseDict['data_in'] + xInName = parseDict['x_in'] + gradInName = parseDict['data_out'] + + for bufferName in [gradOutName, xInName, gradInName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + numDims = len(ctxt.lookup(gradOutName).shape) + + # All three tensors share the same channel tile size (last dim in HWC) + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=gradInName, dimIdx=numDims - 1) == + tilerModel.getTensorDimVar(tensorName=gradOutName, dimIdx=numDims - 1)) + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=xInName, dimIdx=numDims - 1) == + tilerModel.getTensorDimVar(tensorName=gradOutName, dimIdx=numDims - 1)) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + gradOutName = parseDict['data_in'] + xInName = parseDict['x_in'] + gradInName = parseDict['data_out'] + + numDims = len(ctxt.lookup(gradOutName).shape) + + # Fix all dimensions except the channel dimension (last) for all three tensors + for bufferName in [gradOutName, xInName, gradInName]: + buf_shape = 
ctxt.lookup(bufferName).shape + for idx in range(numDims - 1): + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName=bufferName, dimIdx=idx) == buf_shape[idx]) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + # x_in may or may not be in the tiling solution (if it is a global buffer it is excluded) + x_in_name = operatorRepresentation['x_in'] + x_in_in_solution = x_in_name in tilingSolution.tensorMemoryConstraints + if x_in_in_solution: + xInBaseOffsets, _ = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, ['x_in']) + inputBaseOffsets.update(xInBaseOffsets) + + gradOutShape = ctxt.lookup(operatorRepresentation['data_in']).shape + gradInShape = ctxt.lookup(operatorRepresentation['data_out']).shape + xInShape = ctxt.lookup(x_in_name).shape + + numDims = len(gradOutShape) + + replacementTypes = {} + replacements: Dict[str, List[int]] = {} + replacementTypes["ch_im_in"] = PointerClass(uint16_t) + replacements["ch_im_in"] = [] + + inputInCubes = [] + xInCubes = [] + + for cube in outputCubes: + ch_tile = cube.dims[-1] + + # grad_output tile: same channel slice, full spatial dims + grad_out_dims = list(gradOutShape) + grad_out_dims[-1] = ch_tile + grad_out_offset = list(cube.offset[:-1]) + [cube.offset[-1]] + inputInCubes.append(HyperRectangle(tuple(grad_out_offset), tuple(grad_out_dims))) + + # x_in tile: same channel slice, full spatial dims + x_in_dims = list(xInShape) + x_in_dims[-1] = ch_tile + x_in_offset = [0] * (numDims - 1) + 
[cube.offset[-1]] + xInCubes.append(HyperRectangle(tuple(x_in_offset), tuple(x_in_dims))) + + replacements["ch_im_in"].append(ch_tile) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for grad_out_cube, x_in_cube in zip(inputInCubes, xInCubes): + entry = {"data_in": grad_out_cube} + if x_in_in_solution: + entry["x_in"] = x_in_cube + inputLoadSchedule.append(entry) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/SGDTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/SGDTileConstraint.py index b7757786e1..ebef4910ca 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/SGDTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/SGDTileConstraint.py @@ -10,3 +10,9 @@ class SGDTileConstraint(BOPTileConstraint): dataIn1Name = 'weight' dataIn2Name = 'grad' dataOutName = 'weight_updated' + +class ReluGradTileConstraint(BOPTileConstraint): + + dataIn1Name = 'grad_out' + dataIn2Name = 'data_in' + dataOutName = 'grad_in' diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py index 5309300659..f2b4ec2d3c 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py @@ -51,7 +51,7 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw axIndex = list(sliceAxes).index(idx) axStep = sliceSteps[axIndex] - tilerModel.addConstraint(inputDimensionVar == ((outputDimensionVar - 1) * axStep + 1)) + tilerModel.addConstraint(inputDimensionVar == ((outputDimensionVar - 1) * int(axStep) + 1)) else: # Otherwise, input and output dimensions need to be 
equal tilerModel.addConstraint(outputDimensionVar == inputDimensionVar) diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/SoftmaxCrossEntropyLossDualOutputTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/SoftmaxCrossEntropyLossDualOutputTileConstraint.py new file mode 100644 index 0000000000..3456632b79 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/SoftmaxCrossEntropyLossDualOutputTileConstraint.py @@ -0,0 +1,74 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import copy +from typing import Dict, List, Tuple, Union + +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme +from Deeploy.Targets.PULPOpen.TileConstraints.SoftmaxCrossEntropyTileConstraint import \ + SoftmaxCrossEntropyTileConstraint + + +class SoftmaxCrossEntropyLossDualOutputTileConstraint(SoftmaxCrossEntropyTileConstraint): + """TileConstraint for SoftmaxCrossEntropyLoss with 2 outputs: + - log_prob : [batch, num_classes] (primary output — same as single-output version) + - loss : [] 0-d scalar (scalar cross-entropy mean) + + Both batch and num_classes are pinned to their full size by the inherited + addPolicyConstraint, so no actual tiling of SCE occurs. The sole purpose of + this subclass is to override wrapTilingSolution so that the base-class + single-output assertion is bypassed, and the scalar loss buffer is included + in the DMA output schedule. + """ + + # Key in operatorRepresentation for the scalar loss output buffer name. 
+ dataLossName = 'loss' + + @classmethod + def wrapTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, List[TilingSchedule]]: + + logProbVar = operatorRepresentation[cls.dataOutName] # e.g. "onnx::log_prob::3" + lossVar = operatorRepresentation.get(cls.dataLossName, '') + + # If loss is absent (empty string — single-output fallback) or not in the + # memory constraint dict, delegate straight to the parent unchanged. + if not lossVar or lossVar not in tilingSolution.outputTensorMemoryConstraints: + return super().wrapTilingSolution(tilingSolution, targetMemLevel, ctxt, operatorRepresentation) + + # Build a single-output copy of tilingSolution (log_prob only) so that + # the base-class assertion `len(outputTensorMemoryConstraints) == 1` passes. + singleOutputSolution = copy.deepcopy(tilingSolution) + singleOutputSolution.outputTensorMemoryConstraints = { + logProbVar: tilingSolution.outputTensorMemoryConstraints[logProbVar] + } + + # Call the base-class wrapTilingSolution, which runs cube computation and + # calls serializeTilingSolution for log_prob. + varReplacement, tilingSchedules = super().wrapTilingSolution( + singleOutputSolution, targetMemLevel, ctxt, operatorRepresentation) + + # Extend each TilingSchedule to include the scalar loss output. + # The loss tensor is always 1 element (0-d scalar represented as [1] for DMA). + lossAddr = TileConstraint.getBaseAddr(tilingSolution, targetMemLevel, lossVar) + + # If the address is None (IO tensor with runtime-determined address, or tensor + # not allocated at this memory level), skip — same logic as sanitizeTilingSchedule. 
+ if lossAddr == [None]: + return varReplacement, tilingSchedules + + lossRect = HyperRectangle((0,), (1,)) + + for schedule in tilingSchedules: + schedule.outputBaseOffsets[cls.dataLossName] = lossAddr + for step in schedule.outputLoadSchedule: + step[cls.dataLossName] = lossRect + + return varReplacement, tilingSchedules diff --git a/Deeploy/Targets/PULPOpen/Tiler.py b/Deeploy/Targets/PULPOpen/Tiler.py index 901106459e..fbc47123b3 100644 --- a/Deeploy/Targets/PULPOpen/Tiler.py +++ b/Deeploy/Targets/PULPOpen/Tiler.py @@ -14,15 +14,25 @@ from Deeploy.Targets.Generic.TileConstraints.RQSiHardswishTileConstraint import RQSiHardswishTileConstraint from Deeploy.Targets.Generic.TileConstraints.TransposeTileConstraint import TransposeTileConstraint from Deeploy.Targets.Generic.TileConstraints.UnaryTileConstraint import UnaryTileConstraint -from Deeploy.Targets.PULPOpen.Bindings import PULPAddBindings, PULPConcatBindings, PULPFloatConv2DBindings, \ - PULPFloatDWConv2DBindings, PULPFloatGELUBinding, PULPFloatGELUGradBinding, PULPFloatGEMMBindings, \ - PULPGatherBindings, PULPiHardswishBindings, PULPiRMSNormBindings, PULPiRQSGELUBindings, PULPLayernormBinding, \ +from Deeploy.Targets.PULPOpen.Bindings import PULPAddBindings, PULPAveragePool2DBindings, \ + PULPAveragePoolGrad2DBindings, PULPBatchNormInternalBindings, PULPBatchNormalizationGradBindings, \ + PULPBNGradNormalizeBindings, PULPBNGradReduceBindings, PULPChannelNormalizeBindings, \ + PULPWelfordReduceBindings, \ + PULPGlobalAveragePool2DBindings, PULPGlobalAveragePoolGrad2DBindings, \ + PULPConcatBindings, PULPFloatConv2DBindings, PULPFloatConvGradBBindings, PULPFloatConvGradW2DBindings, \ + PULPMaxPoolGrad2DBindings, \ + PULPFloatConvGradX2DBindings, PULPFloatDWConv2DBindings, PULPFloatDWConvGradW2DBindings, \ + PULPFloatDWConvGradX2DBindings, PULPFloatGELUBinding, PULPFloatGELUGradBinding, PULPFloatGEMMBindings, \ + PULPFloatPWConvGradW2DBindings, PULPFloatPWConvGradX2DBindings, PULPGatherBindings, 
PULPiHardswishBindings, \ + PULPInPlaceAccumulatorV2Bindings, PULPInPlaceAccumulatorV2TiledBindings, PULPiRMSNormBindings, PULPiRQSGELUBindings, PULPLayernormBinding, \ PULPLayernormGradBinding, PULPMatMulBindings, PULPMaxPool1DBindings, PULPMaxPool2DBindings, PULPMulBindings, \ - PULPReduceMeanBindings, PULPReduceSumBindings, PULPReluBinding, PULPReshapeBindings, PULPRQAddBindings, \ + PULPReduceMeanBindings, \ + PULPReduceSumBindings, PULPReluBinding, PULPReluGradBinding, PULPReshapeBindings, PULPRQAddBindings, \ PULPRQSBindings, PULPRQSConv1DBindings, PULPRQSConv2DBindings, PULPRQSDWConv2DBindings, PULPRQSGEMMBindings, \ PULPRQSiHardswishBindings, PULPRQSMatrixVecBindings, PULPRQSTallGEMMBindings, PULPSGDBindings, PULPSliceBindings, \ - PULPSoftmaxBindings, PULPSoftmaxCrossEntropyLossBindings, PULPSoftmaxCrossEntropyLossGradBindings, \ - PULPSoftmaxGradBindings, PULPTransposeBindings, PULPUniformRQSBindings + PULPSoftmaxBindings, PULPSoftmaxCrossEntropyLossBindings, PULPSoftmaxCrossEntropyLossDualOutputBindings, \ + PULPSoftmaxCrossEntropyLossGradBindings, PULPSoftmaxGradBindings, PULPTransposeBindings, PULPUniformRQSBindings, \ + PULPMSELossBindings, PULPMSELossGradBindings from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv1DTileConstraint, \ RQConv2DTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint, \ @@ -35,14 +45,31 @@ from Deeploy.Targets.PULPOpen.TileConstraints.LayernormTileConstraint import LayernormGradTileConstraint, \ LayernormTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.MatMulTileConstraint import MatMulTileConstraint -from Deeploy.Targets.PULPOpen.TileConstraints.MaxPoolTileConstraint import MaxPoolCTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.MaxPoolTileConstraint import MaxPoolCTileConstraint, MaxPoolHWTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.AveragePoolTileConstraint import \ + 
AveragePoolCTileConstraint, AveragePoolHWTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.MaxPoolGradTileConstraint import MaxPoolGradCTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.ReduceMeanConstraint import ReduceMeanTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.ReduceSumTileConstraint import ReduceSumTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.RequantShiftTileConstraint import RequantShiftTileConstraint -from Deeploy.Targets.PULPOpen.TileConstraints.SGDTileConstraint import SGDTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.SGDTileConstraint import ReluGradTileConstraint, SGDTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.SliceConstraint import SliceTileConstraint +from Deeploy.Targets.Generic.TileConstraints.UntiledTileConstraint import UntiledTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.InPlaceAccumulatorV2TileConstraint import InPlaceAccumulatorV2TileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.BatchNormTileConstraint import BatchNormInternalTileConstraint, \ + BatchNormalizationGradTileConstraint, WelfordReduceTileConstraint, ChannelNormalizeTileConstraint, \ + BNGradReduceTileConstraint, BNGradNormalizeTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.GlobalAveragePoolTileConstraint import GlobalAveragePoolTileConstraint, \ + GlobalAveragePoolGradTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.ConvGradConstraint import ConvGradBTileConstraint, \ + ConvGradX2DHWTileConstraint, \ + DWConvGradX2DTileConstraint, DWConvGradW2DTileConstraint, ConvGradW2DTileConstraint, ConvGradX2DIm2ColHWTileConstraint, PWConvGradXTileConstraint, PWConvGradWTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.SoftmaxCrossEntropyTileConstraint import \ SoftmaxCrossEntropyGradTileConstraint, SoftmaxCrossEntropyTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.SoftmaxCrossEntropyLossDualOutputTileConstraint import \ + 
SoftmaxCrossEntropyLossDualOutputTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.MSELossTileConstraint import \ + MSELossTileConstraint, MSELossGradTileConstraint from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings PULPRQSConv1DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSConv1DBindings, @@ -93,7 +120,16 @@ tileConstraint = MaxPoolCTileConstraint()) PULPMaxPool2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPMaxPool2DBindings, - tileConstraint = MaxPoolCTileConstraint()) + tileConstraint = MaxPoolHWTileConstraint()) + +PULPAveragePool2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPAveragePool2DBindings, + tileConstraint = AveragePoolCTileConstraint()) + +PULPAveragePoolGrad2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPAveragePoolGrad2DBindings, + tileConstraint = AveragePoolCTileConstraint()) + +PULPMaxPoolGrad2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPMaxPoolGrad2DBindings, + tileConstraint = MaxPoolGradCTileConstraint()) PULPRQSTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSBindings, tileConstraint = RequantShiftTileConstraint()) @@ -125,6 +161,9 @@ PULPReluTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = [PULPReluBinding], tileConstraint = UnaryTileConstraint()) +PULPReluGradTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = [PULPReluGradBinding], + tileConstraint = ReluGradTileConstraint()) + PULPLayernormTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = [PULPLayernormBinding], tileConstraint = LayernormTileConstraint()) @@ -143,6 +182,10 @@ PULPSoftmaxCrossEntropyTilingReadyBindings = TilingReadyNodeBindings( nodeBindings = PULPSoftmaxCrossEntropyLossBindings, tileConstraint = SoftmaxCrossEntropyTileConstraint()) +PULPSoftmaxCrossEntropyLossDualOutputTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = PULPSoftmaxCrossEntropyLossDualOutputBindings, + 
tileConstraint = SoftmaxCrossEntropyLossDualOutputTileConstraint()) + PULPSoftmaxCrossEntropyGradTilingReadyBindings = TilingReadyNodeBindings( nodeBindings = PULPSoftmaxCrossEntropyLossGradBindings, tileConstraint = SoftmaxCrossEntropyGradTileConstraint()) @@ -155,8 +198,62 @@ PULPSGDTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSGDBindings, tileConstraint = SGDTileConstraint()) +PULPInPlaceAccumulatorV2TilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = PULPInPlaceAccumulatorV2TiledBindings, tileConstraint = InPlaceAccumulatorV2TileConstraint()) + PULPSliceTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSliceBindings, tileConstraint = SliceTileConstraint()) PULPReduceMeanTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPReduceMeanBindings, tileConstraint = ReduceMeanTileConstraint()) + +PULPConvGradX2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatConvGradX2DBindings, + tileConstraint = ConvGradX2DIm2ColHWTileConstraint()) + +PULPConvGradW2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatConvGradW2DBindings, + tileConstraint = ConvGradW2DTileConstraint()) + +PULPDWConvGradX2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatDWConvGradX2DBindings, + tileConstraint = DWConvGradX2DTileConstraint()) + +PULPDWConvGradW2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatDWConvGradW2DBindings, + tileConstraint = DWConvGradW2DTileConstraint()) + +PULPPWConvGradW2DTilingReadyBindings = TilingReadyNodeBindings( nodeBindings = PULPFloatPWConvGradW2DBindings, + tileConstraint = PWConvGradWTileConstraint()) + +PULPPWConvGradX2DTilingReadyBindings = TilingReadyNodeBindings( nodeBindings = PULPFloatPWConvGradX2DBindings, + tileConstraint = PWConvGradXTileConstraint()) + +PULPBatchNormInternalTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = PULPBatchNormInternalBindings, tileConstraint = 
BatchNormInternalTileConstraint()) + +PULPBatchNormalizationGradTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = PULPBatchNormalizationGradBindings, tileConstraint = BatchNormalizationGradTileConstraint()) + +PULPWelfordReduceTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = PULPWelfordReduceBindings, tileConstraint = WelfordReduceTileConstraint()) + +PULPChannelNormalizeTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = PULPChannelNormalizeBindings, tileConstraint = ChannelNormalizeTileConstraint()) + +PULPBNGradReduceTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = PULPBNGradReduceBindings, tileConstraint = BNGradReduceTileConstraint()) + +PULPBNGradNormalizeTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = PULPBNGradNormalizeBindings, tileConstraint = BNGradNormalizeTileConstraint()) + +PULPGlobalAveragePool2DTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = PULPGlobalAveragePool2DBindings, tileConstraint = GlobalAveragePoolTileConstraint()) + +PULPGlobalAveragePoolGrad2DTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = PULPGlobalAveragePoolGrad2DBindings, tileConstraint = GlobalAveragePoolGradTileConstraint()) + +PULPMSELossTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = PULPMSELossBindings, tileConstraint = MSELossTileConstraint()) + +PULPMSELossGradTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = PULPMSELossGradBindings, tileConstraint = MSELossGradTileConstraint()) + +PULPConvGradBTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = PULPFloatConvGradBBindings, tileConstraint = ConvGradBTileConstraint()) \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/TopologyOptimizationPasses/SplitConvGradPass.py b/Deeploy/Targets/PULPOpen/TopologyOptimizationPasses/SplitConvGradPass.py new file mode 100644 index 0000000000..a4f1149af7 --- /dev/null +++ 
b/Deeploy/Targets/PULPOpen/TopologyOptimizationPasses/SplitConvGradPass.py @@ -0,0 +1,132 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import onnx_graphsurgeon as gs + +from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import Pass, contextagnostic + + +def _split_single_conv_grad(graph: gs.Graph, node: gs.Node, counter: int): + """Split one ConvGrad node → ConvGradX + ConvGradW [+ ConvGradB]. + + Original ConvGrad: + inputs : [dY, X, W] (no bias) + outputs: [dX, dW] + + or: + inputs : [dY, X, W, B] (with bias) + outputs: [dX, dW, dB] + + After split: + ConvGradX: inputs=[dY, W] → outputs=[dX] + ConvGradW: inputs=[dY, X] → outputs=[dW] + ConvGradB: inputs=[dY] → outputs=[dB] (only when bias present) + """ + if len(node.inputs) < 3 or len(node.outputs) < 1: + return + + dy = node.inputs[0] # dY: upstream gradient [N, C_out, H_out, W_out] + x = node.inputs[1] # X: forward input [N, C_in, H_in, W_in] + w = node.inputs[2] # W: weight [C_out, C_in/group, kH, kW] + + dx = node.outputs[0] # dX: input gradient [N, C_in, H_in, W_in] + + # Copy attrs; add kernel_shape from the weight tensor to avoid + # Conv2DParser.parseNode computing wrong kernel_shape from inputs[1]. 
+ attrs_x = dict(node.attrs) + attrs_w = dict(node.attrs) + + if 'kernel_shape' not in attrs_x and w.shape is not None and len(w.shape) >= 4: + attrs_x['kernel_shape'] = list(w.shape[2:4]) + + base_name = node.name if node.name else f'ConvGrad_{counter}' + + # ConvGradX: compute dX from dY and W + conv_grad_x = gs.Node( + op = 'ConvGradX', + name = f'{base_name}_ConvGradX', + inputs = [dy, w], + outputs = [dx], + attrs = attrs_x, + ) + graph.nodes.append(conv_grad_x) + + if len(node.outputs) >= 2: + dw = node.outputs[1] # dW: weight gradient [C_out, C_in/group, kH, kW] + + # Propagate shape and dtype from W → dW (same shape; ONNX shape inference misses ConvGrad) + if dw.shape is None and w.shape is not None: + dw.shape = list(w.shape) + if dw.dtype is None and w.dtype is not None: + dw.dtype = w.dtype + + if 'kernel_shape' not in attrs_w and w.shape is not None and len(w.shape) >= 4: + attrs_w['kernel_shape'] = list(w.shape[2:4]) + elif 'kernel_shape' not in attrs_w and dw.shape is not None and len(dw.shape) >= 4: + attrs_w['kernel_shape'] = list(dw.shape[2:4]) + + # ConvGradW: compute dW from dY and X + conv_grad_w = gs.Node( + op = 'ConvGradW', + name = f'{base_name}_ConvGradW', + inputs = [dy, x], + outputs = [dw], + attrs = attrs_w, + ) + graph.nodes.append(conv_grad_w) + + if len(node.outputs) >= 3: + db = node.outputs[2] # dB: bias gradient [C_out] + + # Propagate bias shape and dtype: dB shape == B shape (or [C_out] from W) + if db.shape is None: + if len(node.inputs) >= 4 and node.inputs[3].shape is not None: + db.shape = list(node.inputs[3].shape) + elif w.shape is not None: + db.shape = [w.shape[0]] + if db.dtype is None: + if len(node.inputs) >= 4 and node.inputs[3].dtype is not None: + db.dtype = node.inputs[3].dtype + elif w.dtype is not None: + db.dtype = w.dtype + + # ConvGradB: compute dB = sum(dY, axes=[N, H, W]) + conv_grad_b = gs.Node( + op = 'ConvGradB', + name = f'{base_name}_ConvGradB', + inputs = [dy], + outputs = [db], + attrs = {}, + ) + 
graph.nodes.append(conv_grad_b) + + # Remove the original ConvGrad node + node.inputs.clear() + node.outputs.clear() + graph.nodes.remove(node) + + +@contextagnostic +class SplitConvGradPass(Pass): + """Replace each ConvGrad node with ConvGradX + ConvGradW[B] nodes. + + Handles 1/2/3 outputs: + 1 output (dX only): ConvGradX + 2 outputs (dX + dW): ConvGradX + ConvGradW + 3 outputs (dX + dW + dB): ConvGradX + ConvGradW + ConvGradB + + No-op for inference graphs (which have no ConvGrad nodes). + """ + + def run_pass(self, graph: gs.Graph) -> gs.Graph: + # Collect all ConvGrad nodes first (avoid modifying list while iterating) + nodes_to_split = [n for n in graph.nodes if n.op == 'ConvGrad'] + + for counter, node in enumerate(nodes_to_split): + _split_single_conv_grad(graph, node, counter) + + if nodes_to_split: + graph.cleanup() + + return graph diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py index 76eacd10dd..fc24375968 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py @@ -177,7 +177,7 @@ class TilingVariableReplacementUpdate(CodeTransformationPass, IntrospectiveCodeT _updateReferenceTemplate = NodeTemplate(""" // UPDATE VARIABLE ${reference} - *${reference} = ${baseReference}[${tileIdxVar}]; + ${reference} = &${baseReference}[${tileIdxVar}]; """) def __init__(self, targetMemLevel: str, tileIdxVar: str = "TILING_I"): diff --git a/Deeploy/TilingExtension/TilerExtension.py b/Deeploy/TilingExtension/TilerExtension.py index 2186d4d4c4..e42ddf13ad 100644 --- a/Deeploy/TilingExtension/TilerExtension.py +++ b/Deeploy/TilingExtension/TilerExtension.py @@ -333,7 +333,7 @@ def _convertCtxtToStaticSchedule(self, ctxt: NetworkContext, if _buffer._memoryLevel != memoryLevel: continue - if hasattr(_buffer, "_alias") and 
ctxt.is_global(_buffer._alias): + if hasattr(_buffer, "_alias") and ctxt.is_global(_buffer._alias) and _buffer._alias not in blockNames: continue if hasattr(_buffer, "_alias") and _buffer._alias in blockNames: @@ -398,11 +398,32 @@ def minimalloc(self, memoryMap, ctxt, nodeMemoryConstraint, capacity: int, memor environment variable to be set to the installation directory. """ + blockNames = [block.name for block in memoryMap] + + # In-place alias outputs are costless — their storage is + # already accounted for by the alias target. This mirrors the + # zero-cost logic in _buildCostVector (MemoryScheduler.py) and the + # skip logic in _allocateStaticBuffer. + # We skip them from the MiniMalloc CSV (MiniMalloc does not accept + # size-0 entries) and resolve their addrSpace from the alias target + # after the solver runs. + # NOTE: Only skip when alias target is in the SAME memoryMap. + # When alias target is global (e.g. L2 weight) but we're allocating + # L1, the buffer still needs its own L1 space. 
+ aliasBlocks = set() + for memoryBlock in memoryMap: + _buffer = ctxt.lookup(memoryBlock.name) + if hasattr(_buffer, "_alias") and _buffer._alias in blockNames: + aliasBlocks.add(memoryBlock.name) + with open(f"{self._minimalloc_input}.csv", mode = "w", newline = "") as file: writer = csv.writer(file, lineterminator = "\n") writer.writerow(["id", "lower", "upper", "size"]) for memoryBlock in memoryMap: + if memoryBlock.name in aliasBlocks: + continue + _buffer = ctxt.lookup(memoryBlock.name) if nodeMemoryConstraint is None: _bufferSize = _buffer.size if isinstance( @@ -419,11 +440,12 @@ def minimalloc(self, memoryMap, ctxt, nodeMemoryConstraint, capacity: int, memor 8) * nodeMemoryConstraint.tensorMemoryConstraints[ memoryBlock.name].memoryConstraints[memoryLevel].multiBufferCoefficient + _alignedSize = ((int(_bufferSize) + 3) // 4) * 4 writer.writerow([ memoryBlock.name, str(memoryBlock.lifetime[0]), str(memoryBlock.lifetime[1] + 1), - str(int(_bufferSize)) + str(_alignedSize) ]) try: @@ -452,6 +474,21 @@ def minimalloc(self, memoryMap, ctxt, nodeMemoryConstraint, capacity: int, memor if memoryBlock.name == row[0]: memoryBlock._addrSpace = (int(row[-1]), int(row[-1]) + int(row[-2])) + # JUNGVI: Alias blocks were skipped in the MiniMalloc CSV. + # Resolve their addrSpace from their alias target so that + # downstream code can access it if needed. 
+ for memoryBlock in memoryMap: + if memoryBlock.name in aliasBlocks: + _buffer = ctxt.lookup(memoryBlock.name) + aliasTarget = ctxt.dealiasBuffer(memoryBlock.name) + for targetBlock in memoryMap: + if targetBlock.name == aliasTarget: + memoryBlock._addrSpace = targetBlock._addrSpace + break + else: + # Alias target not in this memoryMap — use zero offset + memoryBlock._addrSpace = (0, 0) + return memoryMap def computeTilingSchedule(self, ctxt: NetworkContext) -> TilingSolution: diff --git a/DeeployTest/CMakeLists.txt b/DeeployTest/CMakeLists.txt index b7f3535790..3d6480d5f9 100644 --- a/DeeployTest/CMakeLists.txt +++ b/DeeployTest/CMakeLists.txt @@ -6,8 +6,16 @@ include_directories(${GENERATED_SOURCE}) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -add_library(network OBJECT ${GENERATED_SOURCE}/Network.c) -target_link_libraries(network PUBLIC deeploylib) +if(TRAINING) + add_library(training_network OBJECT ${GENERATED_SOURCE}/TrainingNetwork.c) + target_link_libraries(training_network PUBLIC deeploylib) + # Optimizer network (SGD kernel, compiled separately to allow different prefix) + add_library(optimizer_network OBJECT ${GENERATED_SOURCE}/OptimizerNetwork.c) + target_link_libraries(optimizer_network PUBLIC deeploylib) +else() + add_library(network OBJECT ${GENERATED_SOURCE}/Network.c) + target_link_libraries(network PUBLIC deeploylib) +endif() if(platform STREQUAL MemPool) add_subdirectory(Platforms/MemPool) @@ -29,7 +37,12 @@ elseif(DEEPLOY_ARCH STREQUAL PULP) ) if (NOT HEXLIST) - target_compile_options(network PUBLIC -DNOFLASH) + if(TRAINING) + target_compile_options(training_network PUBLIC -DNOFLASH) + target_compile_options(optimizer_network PUBLIC -DNOFLASH) + else() + target_compile_options(network PUBLIC -DNOFLASH) + endif() else() gvsoc_flags_add_files_to_hyperflash(GVSOC_HEX_HYPERFLASH_FLAGS HEXLIST) list(APPEND GVSOC_EXTRA_FLAGS ${GVSOC_HEX_HYPERFLASH_FLAGS}) @@ -37,9 +50,12 @@ elseif(DEEPLOY_ARCH STREQUAL PULP) # SCHEREMO: Waive warnings # Pointer sign 
warnings are caused by the data width abstraction used in Deeploy. Signedness is not explicitly modelled, as this is handled by kernels - target_compile_options(network PRIVATE - -Wno-pointer-sign - ) + if(TRAINING) + target_compile_options(training_network PRIVATE -Wno-pointer-sign) + target_compile_options(optimizer_network PRIVATE -Wno-pointer-sign) + else() + target_compile_options(network PRIVATE -Wno-pointer-sign) + endif() if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka) add_subdirectory(Platforms/Siracusa) @@ -61,7 +77,12 @@ elseif(platform STREQUAL GAP9) if (NOT HEXLIST) # L2 mode: No flash/readfs files # Data lives in L2 memory only - target_compile_options(network PUBLIC -DNOFLASH) + if(TRAINING) + target_compile_options(training_network PUBLIC -DNOFLASH) + target_compile_options(optimizer_network PUBLIC -DNOFLASH) + else() + target_compile_options(network PUBLIC -DNOFLASH) + endif() message(STATUS "[Deeploy GAP9] L2 mode: No hex files found, -DNOFLASH set") message(STATUS "[Deeploy GAP9] If you expect L3 mode, ensure Python codegen created hex files in ${GENERATED_SOURCE}/hex/") else() @@ -77,5 +98,13 @@ elseif(platform STREQUAL GAP9) message(STATUS "GAPY_RUNNER_ARGS: ${GAPY_RUNNER_ARGS}") endif() + # Waive warnings in generated code + if(TRAINING) + target_compile_options(training_network PRIVATE -Wno-pointer-sign -Wno-sign-compare) + target_compile_options(optimizer_network PRIVATE -Wno-pointer-sign -Wno-sign-compare) + else() + target_compile_options(network PRIVATE -Wno-pointer-sign -Wno-sign-compare) + endif() + add_subdirectory(Platforms/GAP9) endif() diff --git a/DeeployTest/Platforms/GAP9/CMakeLists.txt b/DeeployTest/Platforms/GAP9/CMakeLists.txt index db06a4e38f..f2d42973be 100644 --- a/DeeployTest/Platforms/GAP9/CMakeLists.txt +++ b/DeeployTest/Platforms/GAP9/CMakeLists.txt @@ -4,25 +4,51 @@ set(ProjectId ${TESTNAME}) +option(TRAINING "Use training harness instead of inference harness" OFF) +# Compile-time training 
parameters (override via -D on cmake command line) +set(N_TRAIN_STEPS "1" CACHE STRING "Number of optimizer steps") +set(N_ACCUM_STEPS "1" CACHE STRING "Number of mini-batches per optimizer step") +set(TRAINING_NUM_DATA_INPUTS "2" CACHE STRING "Number of data inputs per mini-batch") set(${SDKCONFIG_FILE} ${CMAKE_CURRENT_LIST_DIR}/sdk.config) -file(GLOB_RECURSE SOURCES - src/CycleCounter.c - src/deeploytest.c -) +if(TRAINING) + file(GLOB_RECURSE SOURCES + src/CycleCounter.c + src/deeploytraintest.c + ) + set(NETWORK_LIB training_network) +else() + file(GLOB_RECURSE SOURCES + src/CycleCounter.c + src/deeploytest.c + ) + set(NETWORK_LIB network) +endif() add_deeploy_executable(${ProjectId} EXCLUDE_FROM_ALL ${SOURCES}) -# add_executable(${ProjectId} ${SOURCES}) target_include_directories(${ProjectId} PRIVATE ${CMAKE_CURRENT_LIST_DIR}/inc) -target_link_libraries(${ProjectId} PRIVATE network deeploylib) -target_compile_options(${ProjectId} INTERFACE network) +if(TRAINING) + target_link_libraries(${ProjectId} PRIVATE ${NETWORK_LIB} optimizer_network deeploylib) +else() + target_link_libraries(${ProjectId} PRIVATE ${NETWORK_LIB} deeploylib) +endif() +target_compile_options(${ProjectId} INTERFACE ${NETWORK_LIB}) + +if(TRAINING) + target_compile_definitions(${ProjectId} PRIVATE + N_TRAIN_STEPS=${N_TRAIN_STEPS} + N_ACCUM_STEPS=${N_ACCUM_STEPS} + TRAINING_NUM_DATA_INPUTS=${TRAINING_NUM_DATA_INPUTS} + ) +endif() + add_gvsoc_emulation(${ProjectId} "gap9.evk") # RW: Waive sign comparison warnings from pulp_nn_utils.h -target_compile_options(network PRIVATE +target_compile_options(${NETWORK_LIB} PRIVATE -Wno-sign-compare -Wno-pointer-sign -Wno-unknown-pragmas diff --git a/DeeployTest/Platforms/GAP9/src/deeploytraintest.c b/DeeployTest/Platforms/GAP9/src/deeploytraintest.c new file mode 100644 index 0000000000..63cd2270a4 --- /dev/null +++ b/DeeployTest/Platforms/GAP9/src/deeploytraintest.c @@ -0,0 +1,453 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * Training harness for GAP9 — Phase 2 (with Deeploy-compiled OptimizerNetwork) + * + * Adapted from Siracusa training harness for GAP9 platform. + * + * Loop structure: + * + * InitTrainingNetwork() + * InitOptimizerNetwork() + * + * for update_step in [0, N_TRAIN_STEPS): // optimizer steps + * for accum_step in [0, N_ACCUM_STEPS): // mini-batches per update + * lazy_reset_grad = (accum_step == 0) + * load data for this mini-batch + * RunTrainingNetwork() // fwd + bwd + InPlaceAccumulatorV2 + * store loss value + * // SGD weight update via Deeploy-compiled optimizer kernel: + * copy weights + grad_acc → optimizer input buffers + * RunOptimizerNetwork() + * copy weight_updated ← optimizer output buffers → training weight buffers + * + * Numerical verification: + * - Compare stored loss values against testLossRef[] (from testoutputs.h) + * + * Buffer layout in DeeployNetwork_inputs[] (must match ONNX input order): + * [0 .. TRAINING_NUM_DATA_INPUTS-1] data + labels (per mini-batch) + * [TRAINING_NUM_DATA_INPUTS .. + * .. TRAINING_GRAD_BUF_START_IDX-1] weights (persistent) + * [TRAINING_GRAD_BUF_START_IDX .. + * .. 
+TRAINING_NUM_GRAD_INPUTS-1] grad accumulation bufs (persistent) + * [DeeployNetwork_num_inputs-1] lazy_reset_grad uint8 + */ + +#include +#include +#include + +#include "CycleCounter.h" +#include "OptimizerNetwork.h" +#include "TrainingNetwork.h" +#include "dory_mem.h" +#include "pmsis.h" +#include "testinputs.h" +#include "testoutputs.h" + +/* Helper: true when ptr is in L2 (CPU-accessible); false when in L3 (external RAM) */ +#define IS_L2(ptr) ((uint32_t)(ptr) >= 0x10000000u) + +/* ------------------------------------------------------------------------- + * Compile-time defaults — override via CMake target_compile_definitions + * ---------------------------------------------------------------------- */ + +#ifndef N_TRAIN_STEPS +#define N_TRAIN_STEPS 1 +#endif + +#ifndef N_ACCUM_STEPS +#define N_ACCUM_STEPS 1 +#endif + +#ifndef TRAINING_NUM_DATA_INPUTS +#define TRAINING_NUM_DATA_INPUTS 2 +#endif + +/* Training networks are much larger than inference; the master core needs + * a bigger stack for the generated RunTrainingNetwork/InitTrainingNetwork + * functions which have many local variables across deep closure chains. */ +#define MAINSTACKSIZE 12000 +#define SLAVESTACKSIZE 6000 + +/* ------------------------------------------------------------------------- + * Cluster device + * ---------------------------------------------------------------------- */ + +struct pi_device cluster_dev; + +/* ------------------------------------------------------------------------- + * Loss storage (one value per forward pass) + * ---------------------------------------------------------------------- */ + +#define TOTAL_FWD_PASSES (N_TRAIN_STEPS * N_ACCUM_STEPS) +static float stored_losses[TOTAL_FWD_PASSES]; + +/* ------------------------------------------------------------------------- + * L1 arena sharing. + * + * GAP9 L1 is only ~131 KB. Large training models need ~64 KB for the + * Training arena and ~64 KB for the Optimizer arena; both cannot coexist. 
+ * Since RunTrainingNetwork and RunOptimizerNetwork never execute at the + * same time, we allocate a single shared L1 buffer of max(train, opt) + * and point both MEMORYARENA_L1 globals to it. + * ---------------------------------------------------------------------- */ + +/* Weak extern declarations for L1 arena pointers — resolve to NULL when + * the generated code doesn't define them (non-tiled builds). */ +int8_t *DeeployNetwork_MEMORYARENA_L1 __attribute__((weak)); +int8_t *DeeployOptNetwork_MEMORYARENA_L1 __attribute__((weak)); + +/* Saved L1 arena sizes — measured from L1 heap delta during Init calls. */ +static uint32_t _train_l1_size = 0; +static uint32_t _opt_l1_size = 0; + +/* Measure L1 heap consumed by a function call (must run on cluster). */ +static uint32_t _l1_heap_used_by(void (*fn)(uint32_t, uint32_t)) { + uint32_t avail_before = 0, avail_after = 0, dummy = 0; + pi_cl_l1_available_get(&avail_before, &dummy); + fn(pi_core_id(), pi_cl_cluster_nb_cores()); + pi_cl_l1_available_get(&avail_after, &dummy); + return (avail_before > avail_after) ? (avail_before - avail_after) : 0; +} + +/* Free Training L1, init Optimizer, free Optimizer L1, alloc shared. + * All L1 operations run on the cluster (pi_l1_malloc/free require it). */ +static void InitOptAndShareL1(void *args) { + (void)args; + + /* Free Training L1 so Optimizer Init has room in the L1 heap. */ + if (_train_l1_size > 0 && DeeployNetwork_MEMORYARENA_L1) { + pi_l1_free((void *)0, DeeployNetwork_MEMORYARENA_L1, _train_l1_size); + } + + /* Init Optimizer — measure its L1 consumption. */ + _opt_l1_size = _l1_heap_used_by(InitOptimizerNetwork); + + /* If both networks need L1, create a shared arena. */ + if (_train_l1_size > 0 && _opt_l1_size > 0) { + pi_l1_free((void *)0, DeeployOptNetwork_MEMORYARENA_L1, _opt_l1_size); + + uint32_t max_l1 = (_train_l1_size > _opt_l1_size) ? 
_train_l1_size : _opt_l1_size; + int8_t *shared = (int8_t *)pi_l1_malloc((void *)0, max_l1); + DeeployNetwork_MEMORYARENA_L1 = shared; + DeeployOptNetwork_MEMORYARENA_L1 = shared; + } else if (_train_l1_size > 0) { + /* Optimizer doesn't use L1; re-alloc Training L1. */ + DeeployNetwork_MEMORYARENA_L1 = (int8_t *)pi_l1_malloc((void *)0, _train_l1_size); + } +} + +/* ------------------------------------------------------------------------- + * Wrapper functions for cluster task dispatch. + * + * GAP9 code generator produces functions with + * (uint32_t core_id, uint32_t numThreads) + * parameters, while pi_cluster_task expects void (*)(void*). + * ---------------------------------------------------------------------- */ + +void InitTrainingNetworkWrapper(void *args) { + (void)args; + _train_l1_size = _l1_heap_used_by(InitTrainingNetwork); +} + +void RunTrainingNetworkWrapper(void *args) { + (void)args; + RunTrainingNetwork(pi_core_id(), pi_cl_cluster_nb_cores()); +} + +void RunOptimizerNetworkWrapper(void *args) { + (void)args; + RunOptimizerNetwork(pi_core_id(), pi_cl_cluster_nb_cores()); +} + +/* ------------------------------------------------------------------------- + * L3-aware memory transfer: handles all combinations of L2/L3 src and dst + * ---------------------------------------------------------------------- */ + +static void l3_aware_copy(void *dst, const void *src, uint32_t bytes) { + if (IS_L2(dst) && IS_L2(src)) { + memcpy(dst, src, bytes); + } else if (IS_L2(dst)) { + /* L3 → L2 */ + ram_read(dst, (void *)src, bytes); + } else if (IS_L2(src)) { + /* L2 → L3 */ + ram_write(dst, (void *)src, bytes); + } else { + /* L3 → L3: stage through a temporary L2 buffer */ + void *tmp = pi_l2_malloc(bytes); + ram_read(tmp, (void *)src, bytes); + ram_write(dst, tmp, bytes); + pi_l2_free(tmp, bytes); + } +} + +/* ------------------------------------------------------------------------- + * Optimizer step: copy buffers → run → copy back + * 
---------------------------------------------------------------------- */ + +static void run_optimizer_step(void) { +#if defined(TRAINING_NUM_WEIGHT_INPUTS) && (TRAINING_NUM_WEIGHT_INPUTS > 0) + /* --- Step A: copy current weights + grad acc → optimizer input buffers --- + * Skipped when codegen has shared the buffers (pointer equality test). */ + for (uint32_t wi = 0; wi < (uint32_t)TRAINING_NUM_WEIGHT_INPUTS; wi++) { + uint32_t train_w_idx = (uint32_t)TRAINING_NUM_DATA_INPUTS + wi; + uint32_t train_g_idx = (uint32_t)TRAINING_GRAD_BUF_START_IDX + wi; + uint32_t opt_w_in = 2u * wi; + uint32_t opt_g_in = 2u * wi + 1u; + + if (DeeployOptNetwork_inputs[opt_w_in] != DeeployNetwork_inputs[train_w_idx]) { + l3_aware_copy(DeeployOptNetwork_inputs[opt_w_in], + DeeployNetwork_inputs[train_w_idx], + DeeployOptNetwork_inputs_bytes[opt_w_in]); + } + if (DeeployOptNetwork_inputs[opt_g_in] != DeeployNetwork_inputs[train_g_idx]) { + l3_aware_copy(DeeployOptNetwork_inputs[opt_g_in], + DeeployNetwork_inputs[train_g_idx], + DeeployOptNetwork_inputs_bytes[opt_g_in]); + } + } + + /* --- Step B: Run optimizer network --- */ + struct pi_cluster_task opt_task; + pi_cluster_task(&opt_task, RunOptimizerNetworkWrapper, NULL); + opt_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &opt_task); + + /* --- Step C: copy weight_updated back to training network --- + * Skipped when codegen has shared the output buffer with the training input. 
*/ + for (uint32_t wi = 0; wi < (uint32_t)TRAINING_NUM_WEIGHT_INPUTS; wi++) { + uint32_t train_w_idx = (uint32_t)TRAINING_NUM_DATA_INPUTS + wi; + uint32_t opt_w_out = wi; + + if (DeeployOptNetwork_outputs[opt_w_out] == DeeployNetwork_inputs[train_w_idx]) { + continue; /* in-place: training buffer already updated */ + } + + uint32_t opt_bytes = DeeployOptNetwork_outputs_bytes[opt_w_out]; + uint32_t train_bytes = DeeployNetwork_inputs_bytes[train_w_idx]; + if (opt_bytes == train_bytes) { + l3_aware_copy(DeeployNetwork_inputs[train_w_idx], + DeeployOptNetwork_outputs[opt_w_out], + opt_bytes); + } else { + /* Broadcasted bias: fill every tile with updated value. */ + for (uint32_t off = 0; off < train_bytes; off += opt_bytes) { + uint32_t chunk = (off + opt_bytes <= train_bytes) ? opt_bytes : (train_bytes - off); + l3_aware_copy((char *)DeeployNetwork_inputs[train_w_idx] + off, + DeeployOptNetwork_outputs[opt_w_out], + chunk); + } + } + } +#endif /* TRAINING_NUM_WEIGHT_INPUTS */ +} + +/* ------------------------------------------------------------------------- + * Numerical comparison — run on cluster (FC has no FPU) + * ---------------------------------------------------------------------- */ + +typedef struct { + float *computed; + float *reference; + uint32_t n; + uint32_t *err_count; +} LossCompareArgs; + +static void CompareLossesOnCluster(void *args) { + if (pi_core_id() != 0) return; + LossCompareArgs *a = (LossCompareArgs *)args; + float tol = TRAINING_TOLERANCE_ABS; + uint32_t errors = 0; + for (uint32_t i = 0; i < a->n; i++) { + float diff = a->computed[i] - a->reference[i]; + if (diff < 0.0f) diff = -diff; + printf(" [loss %u] computed=%.6f ref=%.6f diff=%.6f TOL=%.6f\r\n", + i, (double)a->computed[i], (double)a->reference[i], + (double)diff, (double)tol); + if (diff > tol) { + errors++; + } + } + *a->err_count = errors; +} + +static void CL_CompareLosses(void *arg) { + pi_cl_team_fork(NUM_CORES, CompareLossesOnCluster, arg); +} + +/* 
------------------------------------------------------------------------- + * main + * ---------------------------------------------------------------------- */ + +int main(void) { + + printf("=== GAP9 Training Harness (Phase 2 — with OptimizerNetwork) ===\r\n"); + printf("N_TRAIN_STEPS=%u N_ACCUM_STEPS=%u DATA_INPUTS=%u\r\n", + (unsigned)N_TRAIN_STEPS, (unsigned)N_ACCUM_STEPS, + (unsigned)TRAINING_NUM_DATA_INPUTS); + + /* ------------------------------------------------------------------ + * Cluster bring-up + * ------------------------------------------------------------------ */ + + struct pi_cluster_conf conf; + pi_cluster_conf_init(&conf); + conf.id = 0; + /* Training networks have deep closure chains with many local variables. + * The default PI_CL_CC_STACK_SIZE (0x800 = 2KB) is too small; increase + * the cluster controller (master core) stack to MAINSTACKSIZE. */ + conf.cc_stack_size = MAINSTACKSIZE; + pi_open_from_conf(&cluster_dev, &conf); + if (pi_cluster_open(&cluster_dev)) + return -1; + + mem_init(); +#ifndef NOFLASH + open_fs(); +#endif + + struct pi_cluster_task cluster_task; + + /* ------------------------------------------------------------------ + * Init training network (allocates L1 + L2 arenas, loads hex data) + * ------------------------------------------------------------------ */ + + printf("Initializing TrainingNetwork...\r\n"); + pi_cluster_task(&cluster_task, InitTrainingNetworkWrapper, NULL); + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + + /* ------------------------------------------------------------------ + * Zero-initialise gradient accumulation buffers. 
+ * ------------------------------------------------------------------ */ + + for (uint32_t _gi = 0; _gi < (uint32_t)TRAINING_NUM_GRAD_INPUTS; _gi++) { + uint32_t _idx = (uint32_t)TRAINING_GRAD_BUF_START_IDX + _gi; + uint32_t bytes = DeeployNetwork_inputs_bytes[_idx]; + void *buf = DeeployNetwork_inputs[_idx]; + if (IS_L2(buf)) { + memset(buf, 0, bytes); + } else { + /* Write zeros into L3 via DMA using a temporary L2 zero page */ + uint8_t *zero_page = pi_l2_malloc(512); + memset(zero_page, 0, 512); + for (uint32_t off = 0; off < bytes; off += 512) { + uint32_t chunk = (off + 512 <= bytes) ? 512 : (bytes - off); + ram_write((char *)buf + off, zero_page, chunk); + } + pi_l2_free(zero_page, 512); + } + } + + /* ------------------------------------------------------------------ + * Init optimizer network. + * + * GAP9 L1 is limited (~131 KB). Large models need ~64 KB each for + * Training and Optimizer L1 arenas — both can't coexist. We free + * Training's L1, init Optimizer (which allocates its own L1), then + * free both and create a single shared L1 arena. + * ------------------------------------------------------------------ */ + + printf("Initializing OptimizerNetwork...\r\n"); + /* Free Training L1, init Optimizer, then set up a shared L1 arena. + * All done in one cluster task since pi_l1_free/malloc are cluster APIs. */ + pi_cluster_task(&cluster_task, InitOptAndShareL1, NULL); + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + + /* ------------------------------------------------------------------ + * lazy_reset_grad is the last input of the training network. + * ------------------------------------------------------------------ */ + + uint32_t reset_idx = DeeployNetwork_num_inputs - 1; + + /* ------------------------------------------------------------------ + * Copy initial weights into network input buffers. 
+ * ------------------------------------------------------------------ */ + +#if defined(TRAINING_NUM_WEIGHT_INPUTS) && (TRAINING_NUM_WEIGHT_INPUTS > 0) + for (uint32_t wi = 0; wi < (uint32_t)TRAINING_NUM_WEIGHT_INPUTS; wi++) { + uint32_t idx = (uint32_t)TRAINING_NUM_DATA_INPUTS + wi; + l3_aware_copy(DeeployNetwork_inputs[idx], testInitWeights[wi], DeeployNetwork_inputs_bytes[idx]); + } +#endif + + printf("Starting training (%u optimizer steps x %u accum steps)...\r\n", + (unsigned)N_TRAIN_STEPS, (unsigned)N_ACCUM_STEPS); + + for (uint32_t update_step = 0; update_step < N_TRAIN_STEPS; update_step++) { + + for (uint32_t accum_step = 0; accum_step < N_ACCUM_STEPS; accum_step++) { + + uint32_t mb = update_step * N_ACCUM_STEPS + accum_step; + + printf(" update %u/%u accum %u/%u (mini-batch %u)\r\n", + update_step + 1, (unsigned)N_TRAIN_STEPS, + accum_step + 1, (unsigned)N_ACCUM_STEPS, + mb); + + /* 1. Set lazy_reset_grad. */ + { + void *reset_ptr = DeeployNetwork_inputs[reset_idx]; + uint8_t reset_val = (accum_step == 0) ? 1u : 0u; + if (IS_L2(reset_ptr)) { + *((uint8_t *)reset_ptr) = reset_val; + } else { + ram_write(reset_ptr, &reset_val, sizeof(uint8_t)); + } + } + + /* 2. Load this mini-batch's data + labels. */ + for (uint32_t buf = 0; buf < TRAINING_NUM_DATA_INPUTS; buf++) { + l3_aware_copy(DeeployNetwork_inputs[buf], + testDataVector[mb % TRAINING_DATA_SIZE][buf], + DeeployNetwork_inputs_bytes[buf]); + } + + /* 3. Forward + backward + InPlaceAccumulatorV2. */ + pi_cluster_task(&cluster_task, RunTrainingNetworkWrapper, NULL); + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + + /* 4. Store loss. */ + { + void *loss_ptr = DeeployNetwork_outputs[0]; + if (IS_L2(loss_ptr)) { + memcpy(&stored_losses[mb], loss_ptr, sizeof(float)); + } else { + ram_read(&stored_losses[mb], loss_ptr, sizeof(float)); + } + } + + } /* end accum_step loop */ + + /* 5. SGD weight update via Deeploy-compiled OptimizerNetwork. 
*/ + run_optimizer_step(); + + } /* end update_step loop */ + + /* ------------------------------------------------------------------ + * Numerical verification — run on cluster (FC has no FPU) + * ------------------------------------------------------------------ */ + + uint32_t loss_err_count = 0; + uint32_t total_loss_checks = (TOTAL_FWD_PASSES < N_LOSS_REFS) ? TOTAL_FWD_PASSES : N_LOSS_REFS; + LossCompareArgs loss_cmp_args = { + .computed = stored_losses, + .reference = (float *)testLossRef, + .n = total_loss_checks, + .err_count = &loss_err_count, + }; + pi_cluster_task(&cluster_task, CL_CompareLosses, &loss_cmp_args); + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + printf("Errors: %u out of %u\r\n", (unsigned)loss_err_count, (unsigned)total_loss_checks); + + return 0; +} diff --git a/DeeployTest/Platforms/Siracusa/CMakeLists.txt b/DeeployTest/Platforms/Siracusa/CMakeLists.txt index 45e6191490..28ac5131f2 100644 --- a/DeeployTest/Platforms/Siracusa/CMakeLists.txt +++ b/DeeployTest/Platforms/Siracusa/CMakeLists.txt @@ -1,19 +1,46 @@ # SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna -# # SPDX-License-Identifier: Apache-2.0 set(ProjectId ${TESTNAME}) -file(GLOB_RECURSE SOURCES - src/CycleCounter.c - src/deeploytest.c -) +option(TRAINING "Use training harness instead of inference harness" OFF) + +# Compile-time training parameters (override via -D on cmake command line) +set(N_TRAIN_STEPS "1" CACHE STRING "Number of optimizer steps") +set(N_ACCUM_STEPS "1" CACHE STRING "Number of mini-batches per optimizer step") +set(TRAINING_NUM_DATA_INPUTS "2" CACHE STRING "Number of data inputs per mini-batch") + +if(TRAINING) + file(GLOB_RECURSE SOURCES + src/CycleCounter.c + src/deeploytraintest.c + ) + set(NETWORK_LIB training_network) +else() + file(GLOB_RECURSE SOURCES + src/CycleCounter.c + src/deeploytest.c + ) + set(NETWORK_LIB network) +endif() add_deeploy_executable(${ProjectId} 
EXCLUDE_FROM_ALL ${SOURCES}) target_include_directories(${ProjectId} PRIVATE ${CMAKE_CURRENT_LIST_DIR}/inc) -target_link_libraries(${ProjectId} PRIVATE network deeploylib) -target_compile_options(${ProjectId} INTERFACE network) -add_gvsoc_emulation(${ProjectId} "siracusa") +if(TRAINING) + target_link_libraries(${ProjectId} PRIVATE ${NETWORK_LIB} optimizer_network deeploylib) +else() + target_link_libraries(${ProjectId} PRIVATE ${NETWORK_LIB} deeploylib) +endif() +target_compile_options(${ProjectId} INTERFACE ${NETWORK_LIB}) +if(TRAINING) + target_compile_definitions(${ProjectId} PRIVATE + N_TRAIN_STEPS=${N_TRAIN_STEPS} + N_ACCUM_STEPS=${N_ACCUM_STEPS} + TRAINING_NUM_DATA_INPUTS=${TRAINING_NUM_DATA_INPUTS} + ) +endif() + +add_gvsoc_emulation(${ProjectId} "siracusa") link_compile_dump(${TESTNAME}) diff --git a/DeeployTest/Platforms/Siracusa/src/deeploytraintest.c b/DeeployTest/Platforms/Siracusa/src/deeploytraintest.c new file mode 100644 index 0000000000..2b43c90710 --- /dev/null +++ b/DeeployTest/Platforms/Siracusa/src/deeploytraintest.c @@ -0,0 +1,415 @@ +/* + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * Training harness for Siracusa — Phase 2 (with Deeploy-compiled OptimizerNetwork) + * + * Loop structure: + * + * InitTrainingNetwork() + * InitOptimizerNetwork() + * Connect optimizer buffers → training network's weight/grad buffers + * + * for update_step in [0, N_TRAIN_STEPS): // optimizer steps + * for accum_step in [0, N_ACCUM_STEPS): // mini-batches per update + * lazy_reset_grad = (accum_step == 0) // reset on first, accumulate on rest + * load data for this mini-batch + * RunTrainingNetwork() // fwd + bwd + InPlaceAccumulatorV2 + * store loss value + * // SGD weight update via Deeploy-compiled optimizer kernel: + * copy weights + grad_acc → optimizer input buffers + * RunOptimizerNetwork() + * copy weight_updated ← optimizer output buffers → training weight buffers + * + * 
Numerical verification: + * - Compare stored loss values against testLossRef[] (from testoutputs.h) + * + * Buffer layout in DeeployNetwork_inputs[] (must match ONNX input order): + * [0 .. TRAINING_NUM_DATA_INPUTS-1] data + labels (per mini-batch) + * [TRAINING_NUM_DATA_INPUTS .. + * .. TRAINING_GRAD_BUF_START_IDX-1] weights (persistent) + * [TRAINING_GRAD_BUF_START_IDX .. + * .. +TRAINING_NUM_GRAD_INPUTS-1] grad accumulation bufs (persistent) + * [DeeployNetwork_num_inputs-1] lazy_reset_grad uint8 + * + * Optimizer buffer layout in DeeployOptNetwork_inputs[] (interleaved pairs): + * [2*i] weight_i (copied from DeeployNetwork_inputs[TRAINING_NUM_DATA_INPUTS+i]) + * [2*i+1] grad_acc_i (copied from DeeployNetwork_inputs[TRAINING_GRAD_BUF_START_IDX+i]) + * DeeployOptNetwork_outputs[i] = weight_i_updated + * → copied back to DeeployNetwork_inputs[TRAINING_NUM_DATA_INPUTS+i] + * + * Compile-time constants (emitted by code generator into testinputs.h): + * N_TRAIN_STEPS number of optimizer (weight-update) steps + * N_ACCUM_STEPS number of mini-batches accumulated per update + * TRAINING_NUM_DATA_INPUTS inputs that change each mini-batch (data + labels) + * TRAINING_GRAD_BUF_START_IDX first grad acc buffer index in DeeployNetwork_inputs[] + * TRAINING_NUM_GRAD_INPUTS number of grad accumulation buffers (== number of weights) + * TRAINING_NUM_WEIGHT_INPUTS number of trainable weight buffers + * TRAINING_LEARNING_RATE SGD learning rate (for reference — embedded in optimizer ONNX) + * + * Reference comparison constants (emitted into testoutputs.h): + * N_LOSS_REFS number of reference loss values + * NUM_WEIGHT_REFS number of reference weight tensors + * TRAINING_TOLERANCE_ABS absolute comparison tolerance + */ + +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +#include "CycleCounter.h" +#include "OptimizerNetwork.h" +#include "TrainingNetwork.h" +#include "dory_mem.h" +#include "pmsis.h" +#include "testinputs.h" +#include "testoutputs.h" + +/* Helper: true when ptr is in L2
(CPU-accessible); false when in L3 (external RAM) */ +#define IS_L2(ptr) ((uint32_t)(ptr) >= 0x10000000u) + +/* ------------------------------------------------------------------------- + * Compile-time defaults — override via CMake target_compile_definitions + * ---------------------------------------------------------------------- */ + +#ifndef N_TRAIN_STEPS +#define N_TRAIN_STEPS 1 +#endif + +#ifndef N_ACCUM_STEPS +#define N_ACCUM_STEPS 1 +#endif + +#ifndef TRAINING_NUM_DATA_INPUTS +#define TRAINING_NUM_DATA_INPUTS 2 +#endif + +#define MAINSTACKSIZE 12000 +#define SLAVESTACKSIZE 3800 + +/* ------------------------------------------------------------------------- + * Cluster device + * ---------------------------------------------------------------------- */ + +struct pi_device cluster_dev; + + +/* ------------------------------------------------------------------------- + * Loss storage (one value per forward pass) + * ---------------------------------------------------------------------- */ + +#define TOTAL_FWD_PASSES (N_TRAIN_STEPS * N_ACCUM_STEPS) +static float stored_losses[TOTAL_FWD_PASSES]; + +/* ------------------------------------------------------------------------- + * Optimizer buffer connection + * + * Connect DeeployOptNetwork_inputs[]/outputs[] to the training network's + * weight and grad acc buffers via memcpy. + * + * Optimizer ONNX input order: [w0, g0, w1, g1, ...] (interleaved pairs) + * Optimizer ONNX output order: [w0_updated, w1_updated, ...] 
+ * ---------------------------------------------------------------------- */ + +/* ------------------------------------------------------------------------- + * L3-aware memory transfer: handles all combinations of L2/L3 src and dst + * ---------------------------------------------------------------------- */ + +static void l3_aware_copy(void *dst, const void *src, uint32_t bytes) { + if (IS_L2(dst) && IS_L2(src)) { + memcpy(dst, src, bytes); + } else if (IS_L2(dst)) { + /* L3 → L2 */ + ram_read(dst, (void *)src, bytes); + } else if (IS_L2(src)) { + /* L2 → L3 */ + ram_write(dst, (void *)src, bytes); + } else { + /* L3 → L3: stage through a temporary L2 buffer */ + void *tmp = pi_l2_malloc(bytes); + ram_read(tmp, (void *)src, bytes); + ram_write(dst, tmp, bytes); + pi_l2_free(tmp, bytes); + } +} + +static void connect_optimizer_buffers(void) { +#if defined(TRAINING_NUM_WEIGHT_INPUTS) && (TRAINING_NUM_WEIGHT_INPUTS > 0) + /* Nothing to pre-allocate — InitOptimizerNetwork() already allocated the + * optimizer's static buffers and set DeeployOptNetwork_inputs[]/outputs[]. + * We only need to sync data at each optimizer step (see run_optimizer_step). */ + (void)0; +#endif +} + +static void run_optimizer_step(void) { +#if defined(TRAINING_NUM_WEIGHT_INPUTS) && (TRAINING_NUM_WEIGHT_INPUTS > 0) + /* --- Step A: copy current weights + grad acc → optimizer input buffers --- + * Skipped when codegen has shared the buffers (pointer equality test). 
*/ + for (uint32_t wi = 0; wi < (uint32_t)TRAINING_NUM_WEIGHT_INPUTS; wi++) { + uint32_t train_w_idx = (uint32_t)TRAINING_NUM_DATA_INPUTS + wi; + uint32_t train_g_idx = (uint32_t)TRAINING_GRAD_BUF_START_IDX + wi; + uint32_t opt_w_in = 2u * wi; + uint32_t opt_g_in = 2u * wi + 1u; + + if (DeeployOptNetwork_inputs[opt_w_in] != DeeployNetwork_inputs[train_w_idx]) { + l3_aware_copy(DeeployOptNetwork_inputs[opt_w_in], + DeeployNetwork_inputs[train_w_idx], + DeeployOptNetwork_inputs_bytes[opt_w_in]); + } + if (DeeployOptNetwork_inputs[opt_g_in] != DeeployNetwork_inputs[train_g_idx]) { + l3_aware_copy(DeeployOptNetwork_inputs[opt_g_in], + DeeployNetwork_inputs[train_g_idx], + DeeployOptNetwork_inputs_bytes[opt_g_in]); + } + } + + struct pi_cluster_task opt_task; + pi_cluster_task(&opt_task, RunOptimizerNetwork, NULL); + opt_task.stack_size = MAINSTACKSIZE; + opt_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &opt_task); + + /* --- Step C: copy weight_updated back to training network's weight buffers --- + * Skipped when codegen has shared the output buffer with the training input. */ + for (uint32_t wi = 0; wi < (uint32_t)TRAINING_NUM_WEIGHT_INPUTS; wi++) { + uint32_t train_w_idx = (uint32_t)TRAINING_NUM_DATA_INPUTS + wi; + uint32_t opt_w_out = wi; + + if (DeeployOptNetwork_outputs[opt_w_out] == DeeployNetwork_inputs[train_w_idx]) { + continue; /* in-place: training buffer already updated */ + } + + uint32_t opt_bytes = DeeployOptNetwork_outputs_bytes[opt_w_out]; + uint32_t train_bytes = DeeployNetwork_inputs_bytes[train_w_idx]; + if (opt_bytes == train_bytes) { + l3_aware_copy(DeeployNetwork_inputs[train_w_idx], + DeeployOptNetwork_outputs[opt_w_out], + opt_bytes); + } else { + /* Broadcasted bias: fill every tile with updated value. */ + for (uint32_t off = 0; off < train_bytes; off += opt_bytes) { + uint32_t chunk = (off + opt_bytes <= train_bytes) ? 
opt_bytes : (train_bytes - off); + l3_aware_copy((char *)DeeployNetwork_inputs[train_w_idx] + off, + DeeployOptNetwork_outputs[opt_w_out], + chunk); + } + } + } +#endif /* TRAINING_NUM_WEIGHT_INPUTS */ +} + +/* ------------------------------------------------------------------------- + * Numerical comparison helpers — run on cluster (FC has no FPU) + * ---------------------------------------------------------------------- */ + +typedef struct { + float *computed; + float *reference; + uint32_t n; + uint32_t *err_count; +} LossCompareArgs; + +static void CompareLossesOnCluster(void *args) { + if (pi_core_id() != 0) return; + LossCompareArgs *a = (LossCompareArgs *)args; + float tol = TRAINING_TOLERANCE_ABS; /* read on cluster — has FPU */ + uint32_t errors = 0; + for (uint32_t i = 0; i < a->n; i++) { + float diff = a->computed[i] - a->reference[i]; + if (diff < 0.0f) diff = -diff; + printf(" [loss %u] computed=%.6f ref=%.6f diff=%.6f TOL=%.6f\r\n", + i, (double)a->computed[i], (double)a->reference[i], + (double)diff, (double)tol); + if (diff > tol) { + errors++; + } + } + *a->err_count = errors; +} + +/* ------------------------------------------------------------------------- + * main + * ---------------------------------------------------------------------- */ + +int main(void) { + + +printf("=== Siracusa Training Harness (Phase 2 — with OptimizerNetwork) ===\r\n"); +printf("N_TRAIN_STEPS=%u N_ACCUM_STEPS=%u DATA_INPUTS=%u\r\n", + (unsigned)N_TRAIN_STEPS, (unsigned)N_ACCUM_STEPS, + (unsigned)TRAINING_NUM_DATA_INPUTS); + + +// /* ------------------------------------------------------------------ +// * Cluster bring-up +// * ------------------------------------------------------------------ */ + + struct pi_cluster_conf conf; + pi_cluster_conf_init(&conf); + conf.id = 0; + pi_open_from_conf(&cluster_dev, &conf); + if (pi_cluster_open(&cluster_dev)) + return -1; + +#ifndef NOFLASH + mem_init(); + open_fs(); +#endif + + struct pi_cluster_task cluster_task; + + /* 
------------------------------------------------------------------ + * Init training network + * ------------------------------------------------------------------ */ + + printf("Initializing TrainingNetwork...\r\n"); + pi_cluster_task(&cluster_task, InitTrainingNetwork, NULL); + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + + /* ------------------------------------------------------------------ + * Zero-initialise gradient accumulation buffers. + * ------------------------------------------------------------------ */ + + +for (uint32_t _gi = 0; _gi < (uint32_t)TRAINING_NUM_GRAD_INPUTS; _gi++) { + uint32_t _idx = (uint32_t)TRAINING_GRAD_BUF_START_IDX + _gi; + uint32_t bytes = DeeployNetwork_inputs_bytes[_idx]; + void *buf = DeeployNetwork_inputs[_idx]; + if (IS_L2(buf)) { + memset(buf, 0, bytes); + } else { + /* Write zeros into L3 via DMA using a temporary L2 zero page */ + uint8_t *zero_page = pi_l2_malloc(512); + memset(zero_page, 0, 512); + for (uint32_t off = 0; off < bytes; off += 512) { + uint32_t chunk = (off + 512 <= bytes) ? 512 : (bytes - off); + ram_write((char *)buf + off, zero_page, chunk); + } + pi_l2_free(zero_page, 512); + } +} + + /* ------------------------------------------------------------------ + * Init optimizer network + * ------------------------------------------------------------------ */ + + printf("Initializing OptimizerNetwork...\r\n"); + pi_cluster_task(&cluster_task, InitOptimizerNetwork, NULL); + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + +// connect_optimizer_buffers(); + +// /* ------------------------------------------------------------------ +// * lazy_reset_grad is the last input of the training network. 
+// * ------------------------------------------------------------------ */ + + uint32_t reset_idx = DeeployNetwork_num_inputs - 1; + + /* ------------------------------------------------------------------ + * Copy initial weights into network input buffers. + * (InitTrainingNetwork only malloc's them; testInitWeights[] holds + * the actual starting values from inputs.npz.) + * ------------------------------------------------------------------ */ + +#if defined(TRAINING_NUM_WEIGHT_INPUTS) && (TRAINING_NUM_WEIGHT_INPUTS > 0) + for (uint32_t wi = 0; wi < (uint32_t)TRAINING_NUM_WEIGHT_INPUTS; wi++) { + uint32_t idx = (uint32_t)TRAINING_NUM_DATA_INPUTS + wi; + l3_aware_copy(DeeployNetwork_inputs[idx], testInitWeights[wi], DeeployNetwork_inputs_bytes[idx]); + } +#endif + + printf("Starting training (%u optimizer steps x %u accum steps)...\r\n", + (unsigned)N_TRAIN_STEPS, (unsigned)N_ACCUM_STEPS); + + uint32_t training_cycles = 0; + uint32_t optimizer_cycles = 0; + + for (uint32_t update_step = 0; update_step < N_TRAIN_STEPS; update_step++) { + + for (uint32_t accum_step = 0; accum_step < N_ACCUM_STEPS; accum_step++) { + + uint32_t mb = update_step * N_ACCUM_STEPS + accum_step; + + printf(" update %u/%u accum %u/%u (mini-batch %u)\r\n", + update_step + 1, (unsigned)N_TRAIN_STEPS, + accum_step + 1, (unsigned)N_ACCUM_STEPS, + mb); + + + /* ① Set lazy_reset_grad. */ + { + void *reset_ptr = DeeployNetwork_inputs[reset_idx]; + uint8_t reset_val = (accum_step == 0) ? 1u : 0u; + if (IS_L2(reset_ptr)) { + *((uint8_t *)reset_ptr) = reset_val; + } else { + ram_write(reset_ptr, &reset_val, sizeof(uint8_t)); + } + } + + /* ② Load this mini-batch's data + labels (cycle through unique samples). */ + for (uint32_t buf = 0; buf < TRAINING_NUM_DATA_INPUTS; buf++) { + l3_aware_copy(DeeployNetwork_inputs[buf], + testDataVector[mb % TRAINING_DATA_SIZE][buf], + DeeployNetwork_inputs_bytes[buf]); + } + + /* ③ Forward + backward + InPlaceAccumulatorV2. 
*/ + pi_cluster_task(&cluster_task, RunTrainingNetwork, NULL); + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + + /* ④ Store loss — use memcpy to avoid float registers on FC (no FPU). */ + { + void *loss_ptr = DeeployNetwork_outputs[0]; + if (IS_L2(loss_ptr)) { + memcpy(&stored_losses[mb], loss_ptr, sizeof(float)); + } else { + ram_read(&stored_losses[mb], loss_ptr, sizeof(float)); + } + } + + } /* end accum_step loop */ + + /* ⑤ SGD weight update via Deeploy-compiled OptimizerNetwork. */ + run_optimizer_step(); + + } /* end update_step loop */ + + // printf("Training complete.\r\n"); + // printf("Total training cycles : %u\r\n", training_cycles); + // printf("Total optimizer cycles : %u\r\n", optimizer_cycles); + + + /* ------------------------------------------------------------------ + * Numerical verification — run on cluster (FC has no FPU) + * ------------------------------------------------------------------ */ + + uint32_t loss_err_count = 0; + uint32_t total_loss_checks = (TOTAL_FWD_PASSES < N_LOSS_REFS) ? 
TOTAL_FWD_PASSES : N_LOSS_REFS; + LossCompareArgs loss_cmp_args = { + .computed = stored_losses, + .reference = (float *)testLossRef, + .n = total_loss_checks, + .err_count = &loss_err_count, + }; + pi_cluster_task(&cluster_task, CompareLossesOnCluster, &loss_cmp_args); + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + printf("Errors: %u out of %u\r\n", (unsigned)loss_err_count, (unsigned)total_loss_checks); + + + + return 0; + +} diff --git a/DeeployTest/Tests/Kernels/FP32/AveragePool/inputs.npz b/DeeployTest/Tests/Kernels/FP32/AveragePool/inputs.npz new file mode 100644 index 0000000000..7ee6201e23 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/AveragePool/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/AveragePool/network.onnx b/DeeployTest/Tests/Kernels/FP32/AveragePool/network.onnx new file mode 100644 index 0000000000..4a9abeb668 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/AveragePool/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/AveragePool/outputs.npz b/DeeployTest/Tests/Kernels/FP32/AveragePool/outputs.npz new file mode 100644 index 0000000000..ad957a8592 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/AveragePool/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ConvGrad/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ConvGrad/inputs.npz new file mode 100644 index 0000000000..d0d8c91d6e Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ConvGrad/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ConvGrad/network.onnx b/DeeployTest/Tests/Kernels/FP32/ConvGrad/network.onnx new file mode 100644 index 0000000000..435e0483d1 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ConvGrad/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ConvGrad/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ConvGrad/outputs.npz new file mode 100644 index 
0000000000..9d0abb7bcf Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ConvGrad/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/GELU/inputs.npz b/DeeployTest/Tests/Kernels/FP32/GeLU/inputs.npz similarity index 100% rename from DeeployTest/Tests/Kernels/FP32/GELU/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/GeLU/inputs.npz diff --git a/DeeployTest/Tests/Kernels/FP32/GELU/network.onnx b/DeeployTest/Tests/Kernels/FP32/GeLU/network.onnx similarity index 100% rename from DeeployTest/Tests/Kernels/FP32/GELU/network.onnx rename to DeeployTest/Tests/Kernels/FP32/GeLU/network.onnx diff --git a/DeeployTest/Tests/Kernels/FP32/GELU/outputs.npz b/DeeployTest/Tests/Kernels/FP32/GeLU/outputs.npz similarity index 100% rename from DeeployTest/Tests/Kernels/FP32/GELU/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/GeLU/outputs.npz diff --git a/DeeployTest/Tests/Kernels/FP32/SoftmaxGrad/inputs.npz b/DeeployTest/Tests/Kernels/FP32/SoftmaxGrad/inputs.npz new file mode 100644 index 0000000000..9324a1f1f0 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/SoftmaxGrad/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/SoftmaxGrad/network.onnx b/DeeployTest/Tests/Kernels/FP32/SoftmaxGrad/network.onnx new file mode 100644 index 0000000000..b245cb6f24 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/SoftmaxGrad/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/SoftmaxGrad/outputs.npz b/DeeployTest/Tests/Kernels/FP32/SoftmaxGrad/outputs.npz new file mode 100644 index 0000000000..6bbe1507e3 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/SoftmaxGrad/outputs.npz differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/inputs.npz b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/inputs.npz deleted file mode 100644 index a9018350f2..0000000000 Binary files a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/inputs.npz and /dev/null differ diff --git 
a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/network.onnx b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/network.onnx deleted file mode 100644 index 7473d7e5c1..0000000000 Binary files a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/network.onnx and /dev/null differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/network_infer.onnx b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/network_infer.onnx deleted file mode 100644 index 11b0ca1f69..0000000000 Binary files a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/network_infer.onnx and /dev/null differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/outputs.npz b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/outputs.npz deleted file mode 100644 index d2ad678b76..0000000000 Binary files a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/outputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/inputs.npz b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/inputs.npz deleted file mode 100644 index 7af9629e9b..0000000000 Binary files a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/inputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/network.onnx b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/network.onnx deleted file mode 100644 index ac9569fb58..0000000000 Binary files a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/network.onnx and /dev/null differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/network_infer.onnx b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/network_infer.onnx deleted file mode 100644 index 366a0be89e..0000000000 Binary files a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/network_infer.onnx and /dev/null differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/outputs.npz b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/outputs.npz deleted file mode 100644 index c2850ae68a..0000000000 Binary files a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/outputs.npz and /dev/null differ diff --git 
a/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/inputs.npz b/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/inputs.npz deleted file mode 100644 index c32b8dfd64..0000000000 Binary files a/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/inputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/network.onnx b/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/network.onnx deleted file mode 100644 index 798e35f96b..0000000000 Binary files a/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/network.onnx and /dev/null differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/network_infer.onnx b/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/network_infer.onnx deleted file mode 100644 index 2eae9e8d7e..0000000000 Binary files a/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/network_infer.onnx and /dev/null differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/outputs.npz b/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/outputs.npz deleted file mode 100644 index bb23f3a08a..0000000000 Binary files a/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/outputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/inputs.npz b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/inputs.npz deleted file mode 100644 index c4296c01c6..0000000000 Binary files a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/inputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/network.onnx b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/network.onnx deleted file mode 100644 index 8f183a9e2c..0000000000 Binary files a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/network.onnx and /dev/null differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/network_infer.onnx b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/network_infer.onnx deleted file mode 100644 index 6cc128149a..0000000000 Binary files a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/network_infer.onnx and /dev/null differ diff --git 
a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/outputs.npz b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/outputs.npz deleted file mode 100644 index e34b4860ed..0000000000 Binary files a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/outputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/inputs.npz b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/inputs.npz deleted file mode 100644 index 71d400304c..0000000000 Binary files a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/inputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/network.onnx b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/network.onnx deleted file mode 100644 index 93a262b786..0000000000 Binary files a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/network.onnx and /dev/null differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/network_infer.onnx b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/network_infer.onnx deleted file mode 100644 index 9c5a0963db..0000000000 Binary files a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/network_infer.onnx and /dev/null differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/outputs.npz b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/outputs.npz deleted file mode 100644 index b134b08d6a..0000000000 Binary files a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/outputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/Models/Training/Autoencoder/autoencoder_optimizer/network.onnx b/DeeployTest/Tests/Models/Training/Autoencoder/autoencoder_optimizer/network.onnx new file mode 100644 index 0000000000..d2fb55e00b Binary files /dev/null and b/DeeployTest/Tests/Models/Training/Autoencoder/autoencoder_optimizer/network.onnx differ diff --git a/DeeployTest/Tests/Models/Training/Autoencoder/autoencoder_train/inputs.npz b/DeeployTest/Tests/Models/Training/Autoencoder/autoencoder_train/inputs.npz new file mode 100644 index 0000000000..7c5a6e0588 Binary files /dev/null and 
b/DeeployTest/Tests/Models/Training/Autoencoder/autoencoder_train/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Training/Autoencoder/autoencoder_train/network.onnx b/DeeployTest/Tests/Models/Training/Autoencoder/autoencoder_train/network.onnx new file mode 100644 index 0000000000..0311f8353f Binary files /dev/null and b/DeeployTest/Tests/Models/Training/Autoencoder/autoencoder_train/network.onnx differ diff --git a/DeeployTest/Tests/Models/Training/Autoencoder/autoencoder_train/outputs.npz b/DeeployTest/Tests/Models/Training/Autoencoder/autoencoder_train/outputs.npz new file mode 100644 index 0000000000..e7f924dfec Binary files /dev/null and b/DeeployTest/Tests/Models/Training/Autoencoder/autoencoder_train/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Training/SmallTransformer/tinytransformer_optimizer/network.onnx b/DeeployTest/Tests/Models/Training/SmallTransformer/tinytransformer_optimizer/network.onnx new file mode 100644 index 0000000000..2c902b1c26 Binary files /dev/null and b/DeeployTest/Tests/Models/Training/SmallTransformer/tinytransformer_optimizer/network.onnx differ diff --git a/DeeployTest/Tests/Models/Training/SmallTransformer/tinytransformer_train/inputs.npz b/DeeployTest/Tests/Models/Training/SmallTransformer/tinytransformer_train/inputs.npz new file mode 100644 index 0000000000..5a236eb960 Binary files /dev/null and b/DeeployTest/Tests/Models/Training/SmallTransformer/tinytransformer_train/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Training/SmallTransformer/tinytransformer_train/network.onnx b/DeeployTest/Tests/Models/Training/SmallTransformer/tinytransformer_train/network.onnx new file mode 100644 index 0000000000..71df82b70a Binary files /dev/null and b/DeeployTest/Tests/Models/Training/SmallTransformer/tinytransformer_train/network.onnx differ diff --git a/DeeployTest/Tests/Models/Training/SmallTransformer/tinytransformer_train/outputs.npz 
b/DeeployTest/Tests/Models/Training/SmallTransformer/tinytransformer_train/outputs.npz new file mode 100644 index 0000000000..176d82d2f3 Binary files /dev/null and b/DeeployTest/Tests/Models/Training/SmallTransformer/tinytransformer_train/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Transformer_Train/inputs.npz b/DeeployTest/Tests/Models/Transformer_Train/inputs.npz deleted file mode 100644 index 8bbe2fc71c..0000000000 Binary files a/DeeployTest/Tests/Models/Transformer_Train/inputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/Models/Transformer_Train/network.onnx b/DeeployTest/Tests/Models/Transformer_Train/network.onnx deleted file mode 100644 index fb25e5d785..0000000000 Binary files a/DeeployTest/Tests/Models/Transformer_Train/network.onnx and /dev/null differ diff --git a/DeeployTest/Tests/Models/Transformer_Train/outputs.npz b/DeeployTest/Tests/Models/Transformer_Train/outputs.npz deleted file mode 100644 index 0de78e0880..0000000000 Binary files a/DeeployTest/Tests/Models/Transformer_Train/outputs.npz and /dev/null differ diff --git a/DeeployTest/deeployTrainingRunner.py b/DeeployTest/deeployTrainingRunner.py new file mode 100644 index 0000000000..815d713ad9 --- /dev/null +++ b/DeeployTest/deeployTrainingRunner.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +""" +CLI runner for training tests on Siracusa and GAP9. 
+ +Usage: + python deeployTrainingRunner.py -t <path_to_test> [-p Siracusa|GAP9] [--tiled] [options] + +Examples: + python deeployTrainingRunner.py -t Tests/Models/MLP_Train/simplemlp_train + python deeployTrainingRunner.py -t Tests/Models/MLP_Train/simplemlp_train -p GAP9 + python deeployTrainingRunner.py -t Tests/Models/SmallTransformer/tinytransformer_train --tiled + python deeployTrainingRunner.py -t Tests/Models/SmallTransformer/tinytransformer_train --tiled -p GAP9 +""" + +import argparse +import sys + +from testUtils.deeployTrainingRunner import main + +if __name__ == '__main__': + # Peek at --tiled and -p before passing to main(), which builds its own parser. + pre = argparse.ArgumentParser(add_help=False) + pre.add_argument('--tiled', action='store_true', default=False) + pre.add_argument('-p', '--platform', default='Siracusa') + known, _ = pre.parse_known_args() + + sys.exit(main(tiling_enabled=known.tiled, default_platform=known.platform)) diff --git a/DeeployTest/deeployTrainingRunner_gap9.py b/DeeployTest/deeployTrainingRunner_gap9.py new file mode 100644 index 0000000000..9c28b6a1bc --- /dev/null +++ b/DeeployTest/deeployTrainingRunner_gap9.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import sys + +from testUtils.deeployTrainingRunner import main + +if __name__ == '__main__': + sys.exit(main(tiling_enabled = False, default_platform = 'GAP9')) diff --git a/DeeployTest/deeployTrainingRunner_siracusa.py b/DeeployTest/deeployTrainingRunner_siracusa.py new file mode 100644 index 0000000000..c13cc31411 --- /dev/null +++ b/DeeployTest/deeployTrainingRunner_siracusa.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import sys + +from testUtils.deeployTrainingRunner import main + +if __name__ == '__main__': + sys.exit(main(tiling_enabled = False)) diff --git
a/DeeployTest/deeployTrainingRunner_tiled_gap9.py b/DeeployTest/deeployTrainingRunner_tiled_gap9.py new file mode 100644 index 0000000000..8c6d6c6a79 --- /dev/null +++ b/DeeployTest/deeployTrainingRunner_tiled_gap9.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import sys + +from testUtils.deeployTrainingRunner import main + +if __name__ == '__main__': + sys.exit(main(tiling_enabled = True, default_platform = 'GAP9')) diff --git a/DeeployTest/deeployTrainingRunner_tiled_siracusa.py b/DeeployTest/deeployTrainingRunner_tiled_siracusa.py new file mode 100644 index 0000000000..3509fc04fe --- /dev/null +++ b/DeeployTest/deeployTrainingRunner_tiled_siracusa.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import sys + +from testUtils.deeployTrainingRunner import main + +if __name__ == '__main__': + sys.exit(main(tiling_enabled = True)) diff --git a/DeeployTest/generateOptimizerNetwork.py b/DeeployTest/generateOptimizerNetwork.py new file mode 100644 index 0000000000..b2d3031fe9 --- /dev/null +++ b/DeeployTest/generateOptimizerNetwork.py @@ -0,0 +1,161 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +""" +Optimizer network code-generation entry point. + +Loads the optimizer ONNX graph (containing Deeploy SGD nodes) and emits +OptimizerNetwork.c / OptimizerNetwork.h into the specified output directory. + +The generated code uses the prefix ``DeeployOptNetwork_`` (instead of the +default ``DeeployNetwork_``) so that it can be linked together with the +training network without symbol conflicts. 
+ +Usage +----- + /usr/bin/python generateOptimizerNetwork.py \\ + -t \\ # directory containing network.onnx + -d \\ # where to write OptimizerNetwork.c/h + -p Siracusa \\ + --cores 8 \\ + --lr 0.001 +""" + +import os +import sys +from pathlib import Path + +import numpy as np +import onnx +import onnx_graphsurgeon as gs +from testUtils.codeGenerate import build_shared_buffer_maps, generateOptimizerTestNetwork +from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform +from testUtils.testRunner import TestGeneratorArgumentParser + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t +from Deeploy.DeeployTypes import _NoVerbosity +from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper +from Deeploy.MemoryLevelExtension.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine + + +def generateOptimizerNetwork(args): + log.debug("Arguments: %s", args) + + # 1. Load optimizer network.onnx + onnx_path = f'{args.dir}/network.onnx' + onnx_model = onnx.load_model(onnx_path) + graph = gs.import_onnx(onnx_model) + + log.debug(f"Optimizer ONNX inputs: {[i.name for i in onnx_model.graph.input]}") + log.debug(f"Optimizer ONNX outputs: {[o.name for o in onnx_model.graph.output]}") + + # 2. Platform setup + platform, signProp = mapPlatform(args.platform) + log.debug(f"Platform: {platform} (sign: {signProp})") + + clusters = [e for e in platform.engines if isinstance(e, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + + # 3. All optimizer inputs are float32 (weights + grad acc buffers). 
+ graph_input_names = [inp.name for inp in onnx_model.graph.input] + inputTypes = {f"input_{i}": PointerClass(float32_t) for i in range(len(graph_input_names))} + inputOffsets = {f"input_{i}": 0 for i in range(len(graph_input_names))} + + # 4. Create and prepare deployer + _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates_optimizer") + + deployer = mapDeployer(platform, + graph, + inputTypes, + name="DeeployOptimizerNetwork", + deeployStateDir=_DEEPLOYSTATEDIR, + inputOffsets=inputOffsets) + + # Set up memory hierarchy so AnnotateDefaultMemoryLevel assigns the correct + # memory level to ConstantBuffers (weights). The optimizer graph is NOT + # tiled, but it must share the same memory-level view as the training graph + # so that weights end up in the same physical location (L2 when L3 is the + # training default, see AnnotateDefaultMemoryLevel). + L3 = MemoryLevel(name="L3", neighbourNames=["L2"], size=64000000) + L2 = MemoryLevel(name="L2", neighbourNames=["L3", "L1"], size=args.l2) + L1 = MemoryLevel(name="L1", neighbourNames=["L2"], size=args.l1) + memoryHierarchy = MemoryHierarchy([L3, L2, L1]) + memoryHierarchy.setDefaultMemoryLevel(args.defaultMemLevel) + defaultTargetMemoryLevel = memoryHierarchy.memoryLevels[args.defaultMemLevel] + + deployer.Platform = setupMemoryPlatform(deployer.Platform, memoryHierarchy, defaultTargetMemoryLevel) + deployer = MemoryDeployerWrapper(deployer, [AnnotateDefaultMemoryLevel(memoryHierarchy)]) + + verbosityCfg = _NoVerbosity + _ = deployer.prepare(verbosityCfg) + + # 5. 
Build shared-buffer maps when the training ONNX is available + shared_input_map: dict = {} + shared_output_map: dict = {} + training_onnx = Path(args.training_dir) / "network.onnx" if args.training_dir else None + if training_onnx and training_onnx.exists(): + shared_input_map, shared_output_map = build_shared_buffer_maps(str(training_onnx), onnx_model) + log.debug(f"[SharedBuffers] input map: {shared_input_map}") + log.debug(f"[SharedBuffers] output map: {shared_output_map}") + log.info(f"[OptimizerNetwork] Sharing {len(shared_input_map)} inputs and " + f"{len(shared_output_map)} outputs with TrainingNetwork") + else: + if args.training_dir: + log.warning(f"[OptimizerNetwork] training_dir set but {training_onnx} not found — " + "generating standalone OptimizerNetwork (no buffer sharing)") + + # 6. Generate OptimizerNetwork.c / OptimizerNetwork.h + os.makedirs(args.dumpdir, exist_ok=True) + generateOptimizerTestNetwork(deployer, args.dumpdir, verbosityCfg, shared_input_map, shared_output_map) + + log.info(f"Optimizer network code generated in: {args.dumpdir}") + print(f"[OptimizerNetwork] Generated OptimizerNetwork.c/h in {args.dumpdir}") + +if __name__ == '__main__': + + parser = TestGeneratorArgumentParser(description="Deeploy Optimizer Network Code Generation.") + parser.add_argument( + "--cores", + type=int, + default=1, + help="Number of cluster cores. Default: 1.", + ) + parser.add_argument( + "--lr", + type=float, + default=0.001, + help="Learning rate (informational only; embedded in optimizer ONNX attributes). Default: 0.001.", + ) + parser.add_argument("--defaultMemLevel", type=str, default="L2", + help="Default memory level (L2 or L3). Must match the training graph. Default: L2.") + parser.add_argument("--l1", type=int, default=64000, help="L1 size in bytes. Default: 64000.") + parser.add_argument("--l2", type=int, default=1024000, help="L2 size in bytes. 
Default: 1024000.") + parser.add_argument( + "--training-dir", + type=str, + default=None, + help="Directory containing the training network.onnx. When provided, " + "weight and grad-acc buffers are shared with TrainingNetwork instead " + "of being allocated independently.", + ) + parser.add_argument('--shouldFail', action='store_true') + parser.set_defaults(shouldFail=False) + + args = parser.parse_args() + + try: + generateOptimizerNetwork(args) + except Exception as e: + if args.shouldFail: + print("\033[92mOptimizer network generation ended, failed as expected!\033[0m") + sys.exit(0) + else: + raise e + + if args.shouldFail: + raise RuntimeError("Expected to fail!") diff --git a/DeeployTest/generateTrainingNetwork.py b/DeeployTest/generateTrainingNetwork.py new file mode 100644 index 0000000000..d27e74aba8 --- /dev/null +++ b/DeeployTest/generateTrainingNetwork.py @@ -0,0 +1,373 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import json +import os +import sys + +import numpy as np +import onnx +import onnx_graphsurgeon as gs +from testUtils.codeGenerate import generateTrainingTestNetwork +from testUtils.platformMapping import mapDeployer, mapPlatform +from testUtils.testRunner import TestGeneratorArgumentParser +from testUtils.typeMapping import inferTypeAndOffset + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t, uint8_t +from Deeploy.DeeployTypes import _NoVerbosity +from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine, PULPPlatform + +_GRAD_ACC = "_grad.accumulation.buffer" + + +def _load_reference_losses(train_dir: str) -> list: + """Load reference loss values from outputs.npz.""" + outputs_path = os.path.join(train_dir, "outputs.npz") + if not os.path.exists(outputs_path): + log.warning(f"outputs.npz not found at {outputs_path} — loss comparison skipped") + return 
None + + try: + outputs = np.load(outputs_path) + except Exception as e: + log.warning(f"Failed to load outputs.npz: {e} — loss comparison skipped") + return None + + for key in outputs.files: + if 'loss' in key.lower(): + vals = [float(v) for v in np.array(outputs[key]).flatten().tolist()] + log.info(f"Reference losses loaded from outputs.npz['{key}']: {vals}") + return vals + + log.warning("No 'loss' key found in outputs.npz — loss comparison skipped") + return None + + +def _infer_num_data_inputs(inputs_path: str) -> int: + """Auto-detect number of data inputs from inputs.npz. + + Data inputs are the base arr_* entries that have per-mini-batch + variants (mb1_arr_*) in the npz — i.e. entries that actually change + across mini-batches. + + Raises ValueError if no mb1 entries are found (single-mini-batch case) + where the data/weight boundary cannot be determined automatically. + """ + inputs = np.load(inputs_path) + base_keys = sorted(k for k in inputs.files if not k.startswith('mb') and not k.startswith('meta_')) + count = sum(1 for k in base_keys if f'mb1_{k}' in inputs.files) + if count == 0: + raise ValueError( + "Cannot auto-detect num_data_inputs: inputs.npz has only one mini-batch " + "(no mb1_arr_* entries found). Please pass --num-data-inputs explicitly." + ) + return count + + +def _infer_total_mb(inputs_path: str) -> int: + """Count total mini-batches from inputs.npz. + + New format: inputs.npz contains meta_n_batches (total training mini-batches) + and meta_data_size (number of unique samples stored; C harness cycles via modulo). + + Legacy format: count 1 + number of unique mb* indices. 
+ """ + inputs = np.load(inputs_path) + if "meta_n_batches" in inputs.files: + return int(inputs["meta_n_batches"].flat[0]) + mb_indices = set() + for key in inputs.files: + if key.startswith('mb'): + try: + idx = int(key.split('_')[0][2:]) + mb_indices.add(idx) + except ValueError: + pass + return 1 + len(mb_indices) + + +def _infer_data_size(inputs_path: str) -> int: + """Return the number of unique input samples stored in inputs.npz. + + New format: reads meta_data_size. + Legacy format: same as _infer_total_mb (all batches were unique). + """ + inputs = np.load(inputs_path) + if "meta_data_size" in inputs.files: + return int(inputs["meta_data_size"].flat[0]) + return _infer_total_mb(inputs_path) + + +def _infer_n_accum(inputs_path: str) -> int: + """Return the gradient accumulation step count stored in inputs.npz. + + New format: reads meta_n_accum written by the exporter. + Legacy format: defaults to 1 (no gradient accumulation). + """ + inputs = np.load(inputs_path) + if "meta_n_accum" in inputs.files: + return int(inputs["meta_n_accum"].flat[0]) + return 1 + + +def generateTrainingNetwork(args): + log.debug("Arguments: %s", args) + + # 1. Load network.onnx (training graph) + onnx_graph = onnx.load_model(f'{args.dir}/network.onnx') + graph = gs.import_onnx(onnx_graph) + + # 1a. Handle UNDEFINED-typed outputs in training ONNX graphs. + # Backward pass ONNX often doesn't propagate types for gradient outputs. + # (i) Strip UNDEFINED-typed outputs that have no consumers. + # (ii) Patch UNDEFINED-typed outputs WITH consumers to float32 (training default). 
+ _stripped = False + _patched = False + for node in graph.nodes: + filtered = [out for out in node.outputs + if not (out.dtype == 0 and len(out.outputs) == 0)] + if len(filtered) < len(node.outputs): + node.outputs = filtered + _stripped = True + for out in node.outputs: + if out.dtype == 0 and len(out.outputs) > 0: + out.dtype = np.dtype(np.float32) + _patched = True + if _stripped: + graph.cleanup() + log.debug("Stripped UNDEFINED-typed unused optional outputs from graph nodes") + if _patched: + log.debug("Patched UNDEFINED-typed outputs with consumers to float32") + + # 2. Load inputs.npz (new format: no grad acc buf entries) + inputs_path = f'{args.dir}/inputs.npz' + inputs = np.load(inputs_path) + + # 3. Platform setup + platform, signProp = mapPlatform(args.platform) + + log.debug(f"Platform: {platform} (sign: {signProp})") + + # Set cores on cluster engines (same pattern as generateNetwork.py) + clusters = [engine for engine in platform.engines if isinstance(engine, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + + # 4. Identify grad acc buf positions in the ONNX graph. + graph_input_names = [inp.name for inp in onnx_graph.graph.input] + grad_acc_set = {i for i, n in enumerate(graph_input_names) if _GRAD_ACC in n} + non_grad_indices = [i for i in range(len(graph_input_names)) if i not in grad_acc_set] + + # Base npz arrays: keys that are neither per-mb entries (mb*) nor metadata (meta_*) + base_keys = sorted(k for k in inputs.files if not k.startswith('mb') and not k.startswith('meta_')) + npz_base = [inputs[k] for k in base_keys] + + if len(npz_base) != len(non_grad_indices): + raise ValueError( + f"inputs.npz has {len(npz_base)} base entries but network.onnx has " + f"{len(non_grad_indices)} non-grad-buf inputs. " + f"Re-generate inputs.npz with the updated exporter.") + + # Build inputTypes / inputOffsets for ALL graph input positions. 
+ inputTypes = {} + inputOffsets = {} + + npz_idx = 0 + for graph_idx, name in enumerate(graph_input_names): + if graph_idx in grad_acc_set: + inputTypes[f"input_{graph_idx}"] = PointerClass(float32_t) + inputOffsets[f"input_{graph_idx}"] = 0 + else: + arr = npz_base[npz_idx] + npz_idx += 1 + + if arr.dtype == bool or arr.dtype == np.bool_: + inputTypes[f"input_{graph_idx}"] = PointerClass(uint8_t) + inputOffsets[f"input_{graph_idx}"] = 0 + elif arr.dtype in (np.float32, np.float64): + # Float32 training parameters always stay float32. + # inferTypeAndOffset would misclassify integer-valued floats + # (e.g. LayerNorm gamma=1.0 / beta=0.0) as int8_t. + inputTypes[f"input_{graph_idx}"] = PointerClass(float32_t) + inputOffsets[f"input_{graph_idx}"] = 0 + elif np.prod(arr.shape) == 0: + pass + else: + values = arr.reshape(-1).astype(np.float32) + _type, offset = inferTypeAndOffset(values, signProp=False) + inputTypes[f"input_{graph_idx}"] = _type + inputOffsets[f"input_{graph_idx}"] = offset + + # 5. Create deployer + _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates") + + deployer = mapDeployer(platform, + graph, + inputTypes, + name="DeeployTrainingNetwork", + deeployStateDir=_DEEPLOYSTATEDIR, + inputOffsets=inputOffsets) + + log.debug(f"Deployer: {deployer}") + + # 6. Prepare deployer + verbosityCfg = _NoVerbosity + + _ = deployer.prepare(verbosityCfg) + + # 7. Resolve num_data_inputs, n_steps, n_accum (auto-detect when not given). 
+ + # num_data_inputs: detect from npz mb1 variants if not specified + num_data = args.num_data_inputs + if num_data is None: + num_data = _infer_num_data_inputs(inputs_path) + log.info(f"Auto-detected num_data_inputs={num_data} from inputs.npz") + + # n_steps / n_accum: derive from inputs.npz mini-batch count if not specified + n_steps = args.n_steps + n_accum = args.n_accum + if n_steps is None or n_accum is None: + total_mb = _infer_total_mb(inputs_path) + log.info(f"Auto-detected total_mb={total_mb} from inputs.npz") + if n_steps is None and n_accum is None: + n_accum = _infer_n_accum(inputs_path) + n_steps = max(1, total_mb // n_accum) + elif n_steps is None: + n_steps = max(1, total_mb // n_accum) + else: + n_accum = max(1, total_mb // n_steps) + + log.info(f"Training config: n_steps={n_steps} n_accum={n_accum} num_data_inputs={num_data}") + + # 8. Build unique_mb_data from npz (only data_size unique samples). + # The C harness cycles through them via mb % TRAINING_DATA_SIZE. + total_mb = n_steps * n_accum + data_size = _infer_data_size(inputs_path) + log.info(f"Data cycling: data_size={data_size}, total_mb={total_mb}") + mb0_data = list(npz_base[:num_data]) + + unique_mb_data = [] + for mb in range(data_size): + if mb == 0: + unique_mb_data.append(mb0_data) + else: + mb_row = [] + for buf_idx in range(num_data): + key = f"mb{mb}_arr_{buf_idx:04d}" + mb_row.append(inputs[key] if key in inputs else mb0_data[buf_idx]) + unique_mb_data.append(mb_row) + + # Grad acc buf info for testinputs.h. + if grad_acc_set: + sorted_grad = sorted(grad_acc_set) + grad_buf_start_idx = sorted_grad[0] + else: + grad_buf_start_idx = -1 + num_grad_inputs = len(grad_acc_set) + + # Initial weight arrays: npz_base[num_data .. grad_buf_start_idx-1] + if grad_buf_start_idx > num_data: + init_weights = list(npz_base[num_data:grad_buf_start_idx]) + else: + init_weights = [] + + # 9. Load reference loss from outputs.npz. + reference_losses = _load_reference_losses(args.dir) + + # 10. 
Generate all output files + os.makedirs(args.dumpdir, exist_ok=True) + + generateTrainingTestNetwork(deployer, + unique_mb_data, + args.dumpdir, + verbosityCfg, + n_steps=n_steps, + n_accum=n_accum, + num_data_inputs=num_data, + grad_buf_start_idx=grad_buf_start_idx, + num_grad_inputs=num_grad_inputs, + learning_rate=args.learning_rate, + reference_losses=reference_losses, + init_weights=init_weights, + data_size=data_size, + tolerance_abs=args.tolerance_abs) + + # 11. Write resolved config for execution.py to pick up after subprocess call. + meta = { + "n_train_steps": n_steps, + "n_accum_steps": n_accum, + "training_num_data_inputs": num_data, + } + meta_path = os.path.join(args.dumpdir, "training_meta.json") + with open(meta_path, 'w') as f: + json.dump(meta, f, indent=2) + log.info(f"Training meta written to {meta_path}: {meta}") + + +if __name__ == '__main__': + + parser = TestGeneratorArgumentParser(description="Deeploy Training Code Generation Utility.") + parser.add_argument( + "--cores", + type=int, + default=1, + help="Number of cores on which the network is run. " + "Currently required for im2col buffer sizing on Siracusa. Default: 1.", + ) + parser.add_argument( + "--num-data-inputs", + type=int, + dest="num_data_inputs", + default=None, + help="Number of DATA inputs that change per mini-batch. " + "Auto-detected from ONNX graph if not specified.", + ) + parser.add_argument( + "--n-steps", + type=int, + dest="n_steps", + default=None, + help="N_TRAIN_STEPS: number of gradient-accumulation update steps. " + "Auto-detected from inputs.npz mini-batch count if not specified.", + ) + parser.add_argument( + "--n-accum", + type=int, + dest="n_accum", + default=None, + help="N_ACCUM_STEPS: number of mini-batches per update step. 
" + "Auto-detected from inputs.npz mini-batch count if not specified.", + ) + parser.add_argument( + "--learning-rate", + type=float, + dest="learning_rate", + default=0.001, + help="SGD learning rate emitted as TRAINING_LEARNING_RATE in testinputs.h. Default: 0.001.", + ) + parser.add_argument( + "--tolerance", + type=float, + dest="tolerance_abs", + default=1e-3, + help="Absolute loss tolerance emitted as TRAINING_TOLERANCE_ABS in testoutputs.h. Default: 1e-3.", + ) + parser.add_argument('--shouldFail', action='store_true') + parser.set_defaults(shouldFail=False) + + args = parser.parse_args() + + try: + generateTrainingNetwork(args) + except Exception as e: + if args.shouldFail: + print("\033[92mTraining network generation ended, failed as expected!\033[0m") + sys.exit(0) + else: + raise e + + if args.shouldFail: + raise RuntimeError("Expected to fail!") diff --git a/DeeployTest/testMVPOptimizer.py b/DeeployTest/testMVPOptimizer.py new file mode 100644 index 0000000000..9e29d79c55 --- /dev/null +++ b/DeeployTest/testMVPOptimizer.py @@ -0,0 +1,236 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +""" +Tiled optimizer network code-generation entry point. + +Loads the optimizer ONNX graph (containing Deeploy SGD nodes) and emits +OptimizerNetwork.c / OptimizerNetwork.h into the specified output directory, +using the SB-Tiler to tile SGD kernels through L1. + +The generated code uses the prefix ``DeeployOptNetwork_`` (instead of the +default ``DeeployNetwork_``) so that it can be linked together with the +training network without symbol conflicts. 
+ +Usage +----- + /usr/bin/python testMVPOptimizer.py \\ + -t \\ # directory containing network.onnx + -d \\ # where to write OptimizerNetwork.c/h + -p Siracusa \\ + --cores 8 \\ + --l1 64000 \\ + --l2 1024000 \\ + --defaultMemLevel L2 +""" + +import hashlib +import os +import sys +from pathlib import Path +from typing import List + +import onnx +import onnx_graphsurgeon as gs +from testUtils.codeGenerate import build_shared_buffer_maps, generateOptimizerTestNetwork +from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform +from testUtils.testRunner import TestGeneratorArgumentParser +from testUtils.tilingUtils import TrainingSBTiler + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t +from Deeploy.DeeployTypes import CodeGenVerbosity, _NoVerbosity +from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper +from Deeploy.MemoryLevelExtension.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel, \ + AnnotateIOMemoryLevel +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine +from Deeploy.TilingExtension.TilerExtension import TilerDeployerWrapper + + +def _mockScheduler(graph: gs.Graph) -> List[List[gs.Node]]: + """Wrap every node in a singleton list for the Tiler pattern interface.""" + return [[node] for node in graph.nodes] + + +def generateTiledOptimizerNetwork(args) -> None: + log.debug("Arguments: %s", args) + + # 1. Load optimizer network.onnx + onnx_path = f'{args.dir}/network.onnx' + onnx_model = onnx.load_model(onnx_path) + graph = gs.import_onnx(onnx_model) + + log.debug(f"Optimizer ONNX inputs: {[i.name for i in onnx_model.graph.input]}") + log.debug(f"Optimizer ONNX outputs: {[o.name for o in onnx_model.graph.output]}") + + # 2. 
Platform setup + platform, signProp = mapPlatform(args.platform) + log.debug(f"Platform: {platform} (sign: {signProp})") + + clusters = [e for e in platform.engines if isinstance(e, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + + # 3. All optimizer inputs are float32 (weights + grad acc buffers). + graph_input_names = [inp.name for inp in onnx_model.graph.input] + inputTypes = {f"input_{i}": PointerClass(float32_t) for i in range(len(graph_input_names))} + inputOffsets = {f"input_{i}": 0 for i in range(len(graph_input_names))} + + # 4. Create deployer with _mockScheduler (required for TilerDeployerWrapper). + _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates_optimizer") + + deployer = mapDeployer(platform, + graph, + inputTypes, + name="DeeployOptimizerNetwork", + deeployStateDir=_DEEPLOYSTATEDIR, + inputOffsets=inputOffsets, + scheduler=_mockScheduler) + + # 5. Set up memory hierarchy. + # Tiles execute in L1; optimizer I/O (weights, grads) live in L2 (or L3). + L3 = MemoryLevel(name="L3", neighbourNames=["L2"], size=64_000_000) + L2 = MemoryLevel(name="L2", neighbourNames=["L3", "L1"], size=args.l2) + L1 = MemoryLevel(name="L1", neighbourNames=["L2"], size=args.l1) + memoryHierarchy = MemoryHierarchy([L3, L2, L1]) + memoryHierarchy.setDefaultMemoryLevel(args.defaultMemLevel) + + defaultTargetMemLevel = L1 + defaultIoMemLevel = memoryHierarchy.memoryLevels[args.defaultMemLevel] + + # 6. Wrap with memory-level annotation. + deployer.Platform = setupMemoryPlatform(deployer.Platform, memoryHierarchy, defaultTargetMemLevel) + deployer = MemoryDeployerWrapper(deployer, [ + AnnotateIOMemoryLevel(defaultIoMemLevel.name), + AnnotateDefaultMemoryLevel(memoryHierarchy), + ]) + + # 7. Wrap with SBTiler (single-buffering; optimizer is forward-only, no lifetime extension needed). 
+ unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}_optimizer" + testIdentifier = hashlib.md5(unique_params.encode()).hexdigest()[:16] + + # TrainingSBTiler extends all input buffer lifetimes to the end of the + # schedule (via TrainingMemoryScheduler). This prevents the allocator from + # reusing the space of a consumed input (e.g. fc1 weight) for a later + # output (e.g. fc2 updated weight), which would corrupt the weight buffer. + deployer = TilerDeployerWrapper(deployer, TrainingSBTiler, testName=testIdentifier, workDir=args.dumpdir) + deployer.tiler.visualizeMemoryAlloc = args.plotMemAlloc + deployer.tiler.memoryAllocStrategy = args.memAllocStrategy + deployer.tiler.searchStrategy = args.searchStrategy + + # 8. Prepare deployer. + verbosityCfg = _NoVerbosity + if args.profileTiling: + verbosityCfg = CodeGenVerbosity(tilingProfiling=True) + _ = deployer.prepare(verbosityCfg) + + # 9. Build shared-buffer maps when the training ONNX is available + shared_input_map: dict = {} + shared_output_map: dict = {} + training_onnx = Path(args.training_dir) / "network.onnx" if args.training_dir else None + if training_onnx and training_onnx.exists(): + shared_input_map, shared_output_map = build_shared_buffer_maps(str(training_onnx), onnx_model) + log.debug(f"[SharedBuffers] input map: {shared_input_map}") + log.debug(f"[SharedBuffers] output map: {shared_output_map}") + log.info(f"[TiledOptimizerNetwork] Sharing {len(shared_input_map)} inputs and " + f"{len(shared_output_map)} outputs with TrainingNetwork") + else: + if args.training_dir: + log.warning(f"[TiledOptimizerNetwork] training_dir set but {training_onnx} not found — " + "generating standalone OptimizerNetwork (no buffer sharing)") + + # 10. 
Generate OptimizerNetwork.c / OptimizerNetwork.h + os.makedirs(args.dumpdir, exist_ok=True) + generateOptimizerTestNetwork(deployer, args.dumpdir, verbosityCfg, shared_input_map, shared_output_map) + + log.info(f"Tiled optimizer network code generated in: {args.dumpdir}") + print(f"[TiledOptimizerNetwork] Generated OptimizerNetwork.c/h in {args.dumpdir}") + + +if __name__ == '__main__': + + parser = TestGeneratorArgumentParser(description="Deeploy Tiled Optimizer Network Code Generation.") + + parser.add_argument( + "--cores", + type=int, + default=1, + help="Number of cluster cores. Default: 1.", + ) + parser.add_argument( + "--lr", + type=float, + default=0.001, + help="Learning rate (informational only; embedded in optimizer ONNX attributes). Default: 0.001.", + ) + parser.add_argument( + '--l1', + type=int, + dest='l1', + default=64_000, + help='L1 size in bytes. Default: 64000.', + ) + parser.add_argument( + '--l2', + type=int, + dest='l2', + default=1_024_000, + help='L2 size in bytes. Default: 1024000.', + ) + parser.add_argument( + '--defaultMemLevel', + type=str, + dest='defaultMemLevel', + default="L2", + help='Default memory level for optimizer I/O buffers (L2 or L3). Must match the training graph. Default: L2.', + ) + parser.add_argument( + '--memAllocStrategy', + type=str, + dest='memAllocStrategy', + default="MiniMalloc", + help='Memory allocation strategy. Default: MiniMalloc.', + ) + parser.add_argument( + '--searchStrategy', + type=str, + dest='searchStrategy', + default="random-max", + help='CP solver search strategy. 
Default: random-max.', + ) + parser.add_argument( + '--plotMemAlloc', + action='store_true', + help='Save memory allocation plots in the deeployStates folder.', + ) + parser.add_argument( + '--profileTiling', + action='store_true', + help='Enable tiling profiling (inserts cycle counters around each tiled kernel).', + ) + parser.add_argument( + "--training-dir", + type=str, + default=None, + help="Directory containing the training network.onnx. When provided, " + "weight and grad-acc buffers are shared with TrainingNetwork instead " + "of being allocated independently.", + ) + parser.add_argument('--shouldFail', action='store_true') + parser.set_defaults(shouldFail=False) + + args = parser.parse_args() + + try: + generateTiledOptimizerNetwork(args) + except Exception as e: + if args.shouldFail: + print("\033[92mTiled optimizer network generation ended, failed as expected!\033[0m") + sys.exit(0) + else: + raise e + + if args.shouldFail: + raise RuntimeError("Expected to fail!") diff --git a/DeeployTest/testMVPTraining.py b/DeeployTest/testMVPTraining.py new file mode 100644 index 0000000000..30b23dd1e3 --- /dev/null +++ b/DeeployTest/testMVPTraining.py @@ -0,0 +1,421 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import hashlib +import json +import os +import sys +from typing import List + +import numpy as np +import onnx +import onnx_graphsurgeon as gs +from testUtils.codeGenerate import generateTrainingTestNetwork +from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform +from testUtils.testRunner import TestGeneratorArgumentParser +from testUtils.tilingUtils import TrainingSBTiler +from testUtils.typeMapping import inferTypeAndOffset + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t, uint8_t +from Deeploy.DeeployTypes import CodeGenVerbosity, NetworkDeployer, _NoVerbosity +from Deeploy.Logging import 
# Sentinel substring marking gradient-accumulation buffers among the ONNX
# graph inputs (see generateTiledTrainingNetwork step 4).
_GRAD_ACC = "_grad.accumulation.buffer"


# ---------------------------------------------------------------------------
# Helpers copied from generateTrainingNetwork.py
# ---------------------------------------------------------------------------


def _load_reference_losses(train_dir: str) -> "list | None":
    """Load reference loss values from outputs.npz.

    Scans ``<train_dir>/outputs.npz`` for the first key containing 'loss'
    (case-insensitive) and returns its values flattened to a list of floats.

    Returns None (with a warning logged) when the file is missing, cannot
    be loaded, or contains no 'loss' key — callers treat None as
    "skip loss comparison".
    """
    outputs_path = os.path.join(train_dir, "outputs.npz")
    if not os.path.exists(outputs_path):
        log.warning(f"outputs.npz not found at {outputs_path} — loss comparison skipped")
        return None
    try:
        outputs = np.load(outputs_path)
    except Exception as e:
        # Best-effort: a corrupt reference file downgrades to "no comparison",
        # it must not abort code generation.
        log.warning(f"Failed to load outputs.npz: {e} — loss comparison skipped")
        return None
    for key in outputs.files:
        if 'loss' in key.lower():
            vals = [float(v) for v in np.array(outputs[key]).flatten().tolist()]
            log.info(f"Reference losses loaded from outputs.npz['{key}']: {vals}")
            return vals
    log.warning("No 'loss' key found in outputs.npz — loss comparison skipped")
    return None


def _infer_num_data_inputs(inputs_path: str) -> int:
    """Count the DATA inputs that change per mini-batch.

    A base key ``k`` (no 'mb'/'meta_' prefix) counts as a data input when a
    second-mini-batch twin ``mb1_<k>`` exists in the archive.

    Raises ValueError when the archive holds only one mini-batch, since the
    data/weight split cannot be auto-detected then.
    """
    inputs = np.load(inputs_path)
    base_keys = sorted(k for k in inputs.files if not k.startswith('mb') and not k.startswith('meta_'))
    count = sum(1 for k in base_keys if f'mb1_{k}' in inputs.files)
    if count == 0:
        raise ValueError(
            "Cannot auto-detect num_data_inputs: inputs.npz has only one mini-batch "
            "(no mb1_arr_* entries found). Please pass --num-data-inputs explicitly.")
    return count


def _infer_total_mb(inputs_path: str) -> int:
    """Total number of mini-batches: ``meta_n_batches`` if present, else
    1 (the base batch) + the number of distinct ``mbN_*`` index prefixes."""
    inputs = np.load(inputs_path)
    if "meta_n_batches" in inputs.files:
        return int(inputs["meta_n_batches"].flat[0])
    mb_indices = set()
    for key in inputs.files:
        if key.startswith('mb'):
            try:
                # 'mb<N>_arr_XXXX' → N; malformed keys are ignored.
                idx = int(key.split('_')[0][2:])
                mb_indices.add(idx)
            except ValueError:
                pass
    return 1 + len(mb_indices)


def _infer_data_size(inputs_path: str) -> int:
    """Number of unique samples stored: ``meta_data_size`` if present,
    otherwise the total mini-batch count (no cycling)."""
    inputs = np.load(inputs_path)
    if "meta_data_size" in inputs.files:
        return int(inputs["meta_data_size"].flat[0])
    return _infer_total_mb(inputs_path)


def _infer_n_accum(inputs_path: str) -> int:
    """Gradient-accumulation factor: ``meta_n_accum`` if present, else 1."""
    inputs = np.load(inputs_path)
    if "meta_n_accum" in inputs.files:
        return int(inputs["meta_n_accum"].flat[0])
    return 1


# ---------------------------------------------------------------------------
# Mock scheduler (same as testMVP.py)
# ---------------------------------------------------------------------------

# Forward-reference annotations: avoid evaluating gs/List at definition time.
def _mockScheduler(graph: "gs.Graph") -> "List[List[gs.Node]]":
    """Wrap every node in a singleton list for the Tiler pattern interface."""
    return [[node] for node in graph.nodes]


# ---------------------------------------------------------------------------
# Main generation function
# ---------------------------------------------------------------------------

def generateTiledTrainingNetwork(args) -> None:
    log.debug("Arguments: %s", args)

    # 1. Load network.onnx (training graph with forward + backward ops).
    onnx_graph = onnx.load_model(f'{args.dir}/network.onnx')
    graph = gs.import_onnx(onnx_graph)

    # 1a. Strip UNDEFINED-typed unused optional outputs (e.g. MaxPool mask indices).
    # Drop node outputs that are both UNDEFINED-typed (dtype == 0) and unconsumed,
    # then prune the graph once if anything changed.
    _stripped = False
    for node in graph.nodes:
        filtered = [out for out in node.outputs if not (out.dtype == 0 and len(out.outputs) == 0)]
        if len(filtered) < len(node.outputs):
            node.outputs = filtered
            _stripped = True
    if _stripped:
        graph.cleanup()
        log.debug("Stripped UNDEFINED-typed unused optional outputs from graph nodes")

    # 2. Load inputs.npz.
    inputs_path = f'{args.dir}/inputs.npz'
    inputs = np.load(inputs_path)

    # 3. Platform setup.
    platform, signProp = mapPlatform(args.platform)
    log.debug(f"Platform: {platform} (sign: {signProp})")

    # Propagate the requested core count to every PULP cluster engine.
    clusters = [engine for engine in platform.engines if isinstance(engine, PULPClusterEngine)]
    for cluster in clusters:
        cluster.n_cores = args.cores

    # 4. Identify grad acc buf positions in the ONNX graph.
    # Grad-accumulation buffers are recognised by the _GRAD_ACC substring in
    # their input name; all remaining positions map 1:1 onto the npz base keys.
    graph_input_names = [inp.name for inp in onnx_graph.graph.input]
    grad_acc_set = {i for i, n in enumerate(graph_input_names) if _GRAD_ACC in n}
    non_grad_indices = [i for i in range(len(graph_input_names)) if i not in grad_acc_set]

    base_keys = sorted(k for k in inputs.files if not k.startswith('mb') and not k.startswith('meta_'))
    npz_base = [inputs[k] for k in base_keys]

    if len(npz_base) != len(non_grad_indices):
        raise ValueError(
            f"inputs.npz has {len(npz_base)} base entries but network.onnx has "
            f"{len(non_grad_indices)} non-grad-buf inputs. "
            f"Re-generate inputs.npz with the updated exporter.")

    # 5. Build inputTypes / inputOffsets for ALL graph input positions.
    inputTypes = {}
    inputOffsets = {}

    npz_idx = 0
    for graph_idx, name in enumerate(graph_input_names):
        if graph_idx in grad_acc_set:
            # Grad-accumulation buffers are always float32 with no offset.
            inputTypes[f"input_{graph_idx}"] = PointerClass(float32_t)
            inputOffsets[f"input_{graph_idx}"] = 0
        else:
            arr = npz_base[npz_idx]
            npz_idx += 1
            if arr.dtype == bool or arr.dtype == np.bool_:
                inputTypes[f"input_{graph_idx}"] = PointerClass(uint8_t)
                inputOffsets[f"input_{graph_idx}"] = 0
            elif arr.dtype in (np.float32, np.float64):
                inputTypes[f"input_{graph_idx}"] = PointerClass(float32_t)
                inputOffsets[f"input_{graph_idx}"] = 0
            elif np.prod(arr.shape) == 0:
                # Zero-element input: no type/offset registered for this slot.
                pass
            else:
                # Integer-like data: infer the narrowest pointer type + offset.
                values = arr.reshape(-1).astype(np.float32)
                _type, offset = inferTypeAndOffset(values, signProp=False)
                inputTypes[f"input_{graph_idx}"] = _type
                inputOffsets[f"input_{graph_idx}"] = offset

    # 6. Create deployer with _mockScheduler (required for TilerDeployerWrapper).
    _DEEPLOYSTATEDIR = os.path.join(args.dumpdir, "deeployStates")

    deployer = mapDeployer(platform,
                           graph,
                           inputTypes,
                           name="DeeployTrainingNetwork",
                           deeployStateDir=_DEEPLOYSTATEDIR,
                           inputOffsets=inputOffsets,
                           scheduler=_mockScheduler)

    # 7. Set up memory hierarchy.
    L3 = MemoryLevel(name="L3", neighbourNames=["L2"], size=64_000_000)
    L2 = MemoryLevel(name="L2", neighbourNames=["L3", "L1"], size=args.l2)
    L1 = MemoryLevel(name="L1", neighbourNames=["L2"], size=args.l1)
    memoryHierarchy = MemoryHierarchy([L3, L2, L1])
    memoryHierarchy.setDefaultMemoryLevel(args.defaultMemLevel)

    defaultTargetMemLevel = L1
    defaultIoMemLevel = memoryHierarchy.memoryLevels[args.defaultMemLevel]

    # 8. Wrap with memory-level annotation.
    deployer.Platform = setupMemoryPlatform(deployer.Platform, memoryHierarchy, defaultTargetMemLevel)

    deployer = MemoryDeployerWrapper(deployer, [
        AnnotateIOMemoryLevel(defaultIoMemLevel.name),
        AnnotateDefaultMemoryLevel(memoryHierarchy),
    ])

    # 9. Wrap with tiler (TrainingSBTiler: SB strategy + extended input lifetimes for backward pass).
    # The identifier is a stable hash of the tiling-relevant parameters, used
    # as a cache key by the tiler work directory.
    unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}"
    testIdentifier = hashlib.md5(unique_params.encode()).hexdigest()[:16]

    deployer = TilerDeployerWrapper(deployer, TrainingSBTiler, testName=testIdentifier, workDir=args.dumpdir)
    deployer.tiler.visualizeMemoryAlloc = args.plotMemAlloc
    deployer.tiler.memoryAllocStrategy = args.memAllocStrategy
    deployer.tiler.searchStrategy = args.searchStrategy

    # 10. Prepare deployer.
    verbosityCfg = _NoVerbosity
    if args.profileTiling:
        verbosityCfg = CodeGenVerbosity(tilingProfiling = True)
    _ = deployer.prepare(verbosityCfg)

    # 11. Resolve num_data_inputs, n_steps, n_accum.
    # Any value not given on the CLI is auto-detected from inputs.npz; when
    # only one of n_steps/n_accum is given, the other is derived from total_mb.
    num_data = args.num_data_inputs
    if num_data is None:
        num_data = _infer_num_data_inputs(inputs_path)
        log.info(f"Auto-detected num_data_inputs={num_data} from inputs.npz")

    n_steps = args.n_steps
    n_accum = args.n_accum
    if n_steps is None or n_accum is None:
        total_mb = _infer_total_mb(inputs_path)
        log.info(f"Auto-detected total_mb={total_mb} from inputs.npz")
        if n_steps is None and n_accum is None:
            n_accum = _infer_n_accum(inputs_path)
            n_steps = max(1, total_mb // n_accum)
        elif n_steps is None:
            n_steps = max(1, total_mb // n_accum)
        else:
            n_accum = max(1, total_mb // n_steps)

    log.info(f"Training config: n_steps={n_steps} n_accum={n_accum} num_data_inputs={num_data}")

    # 12. Build unique_mb_data from npz.
    # Only data_size unique rows are materialised; the C harness cycles
    # through them modulo TRAINING_DATA_SIZE.
    total_mb = n_steps * n_accum
    data_size = _infer_data_size(inputs_path)
    log.info(f"Data cycling: data_size={data_size}, total_mb={total_mb}")
    mb0_data = list(npz_base[:num_data])

    unique_mb_data = []
    for mb in range(data_size):
        if mb == 0:
            unique_mb_data.append(mb0_data)
        else:
            mb_row = []
            for buf_idx in range(num_data):
                key = f"mb{mb}_arr_{buf_idx:04d}"
                # Fall back to the base mini-batch when a per-mb entry is missing.
                mb_row.append(inputs[key] if key in inputs else mb0_data[buf_idx])
            unique_mb_data.append(mb_row)

    # Grad acc buf info for testinputs.h. -1 signals "no grad buffers".
    if grad_acc_set:
        sorted_grad = sorted(grad_acc_set)
        grad_buf_start_idx = sorted_grad[0]
    else:
        grad_buf_start_idx = -1
    num_grad_inputs = len(grad_acc_set)

    # Weight inputs occupy the slots between the data inputs and the first
    # grad buffer; empty when there are none.
    if grad_buf_start_idx > num_data:
        init_weights = list(npz_base[num_data:grad_buf_start_idx])
    else:
        init_weights = []

    # 13. Load reference losses.
    reference_losses = _load_reference_losses(args.dir)

    # 14. Generate output files.
    os.makedirs(args.dumpdir, exist_ok=True)

    generateTrainingTestNetwork(deployer,
                                unique_mb_data,
                                args.dumpdir,
                                verbosityCfg,
                                n_steps=n_steps,
                                n_accum=n_accum,
                                num_data_inputs=num_data,
                                grad_buf_start_idx=grad_buf_start_idx,
                                num_grad_inputs=num_grad_inputs,
                                learning_rate=args.learning_rate,
                                reference_losses=reference_losses,
                                init_weights=init_weights,
                                data_size=data_size,
                                tolerance_abs=args.tolerance_abs)

    # 15. Write resolved config for execution.py to pick up.
+ meta = { + "n_train_steps": n_steps, + "n_accum_steps": n_accum, + "training_num_data_inputs": num_data, + } + meta_path = os.path.join(args.dumpdir, "training_meta.json") + with open(meta_path, 'w') as f: + json.dump(meta, f, indent=2) + log.info(f"Training meta written to {meta_path}: {meta}") + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + +if __name__ == '__main__': + + parser = TestGeneratorArgumentParser(description="Deeploy Tiled Training Code Generation Utility.") + + # Training params (same as generateTrainingNetwork.py) + parser.add_argument( + "--cores", + type=int, + default=1, + help="Number of cores on which the network is run. Default: 1.", + ) + parser.add_argument( + "--num-data-inputs", + type=int, + dest="num_data_inputs", + default=None, + help="Number of DATA inputs that change per mini-batch. Auto-detected if not specified.", + ) + parser.add_argument( + "--n-steps", + type=int, + dest="n_steps", + default=None, + help="N_TRAIN_STEPS: number of gradient-accumulation update steps.", + ) + parser.add_argument( + "--n-accum", + type=int, + dest="n_accum", + default=None, + help="N_ACCUM_STEPS: number of mini-batches per update step.", + ) + parser.add_argument( + "--learning-rate", + type=float, + dest="learning_rate", + default=0.001, + help="SGD learning rate emitted as TRAINING_LEARNING_RATE in testinputs.h. Default: 0.001.", + ) + + # Tiling params (same as testMVP.py) + parser.add_argument( + '--l1', + type=int, + dest='l1', + default=64_000, + help='Set L1 size in bytes. Default: 64000.', + ) + parser.add_argument( + '--l2', + type=int, + dest='l2', + default=1_024_000, + help='Set L2 size in bytes. Default: 1024000.', + ) + parser.add_argument( + '--defaultMemLevel', + type=str, + dest='defaultMemLevel', + default="L2", + help='Default memory level for IO buffers. 
Default: L2.', + ) + parser.add_argument( + '--memAllocStrategy', + type=str, + dest='memAllocStrategy', + default="MiniMalloc", + help='Memory allocation strategy. Default: MiniMalloc.', + ) + parser.add_argument( + '--searchStrategy', + type=str, + dest='searchStrategy', + default="random-max", + help='CP solver search strategy. Default: random-max.', + ) + parser.add_argument( + '--plotMemAlloc', + action='store_true', + help='Save memory allocation plots in the deeployStates folder.', + ) + parser.add_argument( + '--profileTiling', + action='store_true', + help='Enable tiling profiling (inserts cycle counters around each tiled kernel).', + ) + parser.add_argument( + '--tolerance', + type=float, + dest='tolerance_abs', + default=1e-3, + help='Absolute loss tolerance emitted as TRAINING_TOLERANCE_ABS in testoutputs.h. Default: 1e-3.', + ) + parser.add_argument('--shouldFail', action='store_true') + parser.set_defaults(shouldFail=False) + + args = parser.parse_args() + + try: + generateTiledTrainingNetwork(args) + except Exception as e: + if args.shouldFail: + print("\033[92mTiled training network generation ended, failed as expected!\033[0m") + sys.exit(0) + else: + raise e + + if args.shouldFail: + raise RuntimeError("Expected to fail!") diff --git a/DeeployTest/testUtils/codeGenerate.py b/DeeployTest/testUtils/codeGenerate.py index 39a44d9442..ea73d320e1 100644 --- a/DeeployTest/testUtils/codeGenerate.py +++ b/DeeployTest/testUtils/codeGenerate.py @@ -3,7 +3,9 @@ # SPDX-License-Identifier: Apache-2.0 import os -from typing import List, Tuple +import re +from pathlib import Path +from typing import Dict, List, Optional, Tuple import numpy as np @@ -194,6 +196,15 @@ def generateTestNetworkImplementation(deployer: NetworkDeployer, verbosityCfg: C """ retStr += deployer.generateEngineInitializationCode() retStr += deployer.generateBufferAllocationCode() + + # Initialize all output buffers to zero + output_idx = 0 + while 
deployer.ctxt.is_buffer(f'output_{output_idx}'): + output_buffer = deployer.ctxt.lookup(f'output_{output_idx}') + output_size = np.prod(output_buffer.shape) if hasattr(output_buffer, 'shape') else output_buffer._type.referencedType.typeWidth + typeName = output_buffer._type.referencedType.typeName + output_idx += 1 + retStr += """ } """ @@ -287,3 +298,865 @@ def generateTestNetwork(deployer: NetworkDeployer, test_inputs: List[np.ndarray] os.system(f'clang-format -i --style="{clang_format}" {dumpdir}/Network.h') os.system(f'clang-format -i --style="{clang_format}" {dumpdir}/testoutputs.h') os.system(f'clang-format -i --style="{clang_format}" {dumpdir}/testinputs.h') + + +# --------------------------------------------------------------------------- +# Training code-generation helpers +# --------------------------------------------------------------------------- + + +def generateTrainingTestInputsHeader(deployer: NetworkDeployer, all_mb_data: List[List[np.ndarray]], n_steps: int, + n_accum: int, grad_buf_start_idx: int = 0, num_grad_inputs: int = 0, + learning_rate: float = 0.001, init_weights: List[np.ndarray] = None, + data_size: int = None) -> str: + """Generate testinputs.h for training tests. + + Parameters + ---------- + deployer : NetworkDeployer + Prepared deployer (used to look up buffer types). + all_mb_data : list of list of np.ndarray + Per-mini-batch DATA arrays: ``all_mb_data[mb][buf]`` is the array for + mini-batch *mb* and DATA buffer *buf*. All mini-batches must have the + same number of buffers. + n_steps : int + N_TRAIN_STEPS macro value. + n_accum : int + N_ACCUM_STEPS macro value. + grad_buf_start_idx : int + Index of the first grad accumulation buffer in DeeployNetwork_inputs[]. + Used to emit TRAINING_GRAD_BUF_START_IDX. Pass 0 (and num_grad_inputs=0) + to suppress the define (e.g. when no grad bufs exist). + num_grad_inputs : int + Number of grad accumulation buffers. Used to emit TRAINING_NUM_GRAD_INPUTS. 
+ + Returns + ------- + str + C header string. + """ + total_mb = n_steps * n_accum + num_data = len(all_mb_data[0]) if all_mb_data else 0 + # data_size: number of unique samples stored in C arrays. + # C harness cycles: testDataVector[mb % TRAINING_DATA_SIZE]. + # Defaults to total_mb (no cycling) for backward compatibility. + effective_data_size = data_size if (data_size is not None and data_size < total_mb) else total_mb + + retStr = "" + retStr += f"#define N_TRAIN_STEPS {n_steps}\n" + retStr += f"#define N_ACCUM_STEPS {n_accum}\n" + retStr += f"#define TRAINING_DATA_SIZE {effective_data_size}\n" + retStr += f"#define TRAINING_NUM_DATA_INPUTS {num_data}\n" + if num_grad_inputs > 0: + retStr += f"#define TRAINING_GRAD_BUF_START_IDX {grad_buf_start_idx}\n" + retStr += f"#define TRAINING_NUM_GRAD_INPUTS {num_grad_inputs}\n" + num_weight_inputs = grad_buf_start_idx - num_data + retStr += f"#define TRAINING_NUM_WEIGHT_INPUTS {num_weight_inputs}\n" + retStr += f"#define TRAINING_LEARNING_RATE {learning_rate:.10g}f\n" + retStr += "\n" + + # Emit per-mini-batch buffer arrays — only effective_data_size unique rows. + # all_mb_data must contain exactly effective_data_size rows. + for mb in range(effective_data_size): + mb_data = all_mb_data[mb] if mb < len(all_mb_data) else all_mb_data[-1] + row_entries = [] + for buf_idx, arr in enumerate(mb_data): + values = arr.reshape(-1) + + # Determine C type from deployer context (buffer "input_N"). 
+ input_key = f"input_{buf_idx}" + if deployer.ctxt.is_buffer(input_key): + buffer = deployer.ctxt.lookup(input_key) + typeName = buffer._type.referencedType.typeName + typeWidth = buffer._type.referencedType.typeWidth + else: + # Fallback: infer from numpy dtype + if arr.dtype == np.float32 or arr.dtype == np.float64: + typeName = "float32_t" + typeWidth = 32 + elif arr.dtype == np.int64: + typeName = "int64_t" + typeWidth = 64 + elif arr.dtype == np.bool_ or arr.dtype == bool: + typeName = "uint8_t" + typeWidth = 8 + else: + typeName = "int32_t" + typeWidth = 32 + + buf_name = f"testData_mb{mb}_buf{buf_idx}" + row_entries.append(buf_name) + + # Format values + if typeName == 'float32_t': + list_str = ", ".join([ + f'{float(x)}f' if not (np.isinf(x) or np.isnan(x)) else str(x) for x in values.astype(np.float32) + ]) + else: + list_str = ", ".join([str(x) for x in values]) + + # 4-byte alignment padding + total_bytes = (values.size * typeWidth) // 8 + pad_bytes = (-total_bytes) % 4 + if pad_bytes: + paddingElements = (pad_bytes * 8 + typeWidth - 1) // typeWidth + list_str += ", " + ", ".join("0" for _ in range(paddingElements)) + + retStr += f"{typeName} {buf_name}[] = {{{list_str}}};\n" + + # Emit the row pointer array for this mini-batch + row_name = f"testDataRow{mb}" + retStr += f"void* {row_name}[] = {{{', '.join(f'(void*){e}' for e in row_entries)}}};\n" + retStr += "\n" + + # Emit the top-level vector of row pointers (only unique samples; C harness cycles via modulo). + retStr += f"void** testDataVector[{effective_data_size}] = {{{', '.join(f'testDataRow{mb}' for mb in range(effective_data_size))}}};\n" + + # Emit initial weight arrays (one per weight input, indices num_data..grad_buf_start_idx-1). 
+ if init_weights: + retStr += "\n" + weight_entries = [] + num_data = len(all_mb_data[0]) if all_mb_data else 0 + for wi, arr in enumerate(init_weights): + buf_global_idx = num_data + wi + input_key = f"input_{buf_global_idx}" + if deployer.ctxt.is_buffer(input_key): + buffer = deployer.ctxt.lookup(input_key) + typeName = buffer._type.referencedType.typeName + typeWidth = buffer._type.referencedType.typeWidth + else: + typeName = "float32_t" + typeWidth = 32 + values = arr.reshape(-1).astype(np.float32) + # Tile values to match Deeploy's internal (possibly sequence-length-tiled) shape. + if deployer.ctxt.is_buffer(input_key): + expected_nelems = int(np.prod(deployer.ctxt.lookup(input_key).shape)) + if expected_nelems > len(values) and expected_nelems % len(values) == 0: + values = np.tile(values, expected_nelems // len(values)) + list_str = ", ".join([f'{float(x)}f' for x in values]) + buf_name = f"testInitWeight_{wi}" + weight_entries.append(buf_name) + retStr += f"{typeName} {buf_name}[] = {{{list_str}}};\n" + retStr += f"void* testInitWeights[{len(weight_entries)}] = {{{', '.join(f'(void*){e}' for e in weight_entries)}}};\n" + + return retStr + + +def generateTrainingTestOutputsHeader( + reference_losses: List = None, + tolerance_abs: float = 1e-3, +) -> str: + """Generate testoutputs.h for training tests — loss comparison only. + + Parameters + ---------- + reference_losses : list of float, optional + Reference loss value for each forward pass (one per mini-batch step). + If None, loss comparison is skipped. + tolerance_abs : float + Absolute comparison tolerance emitted as TRAINING_TOLERANCE_ABS. + + Returns + ------- + str + C header string. 
+ """ + has_loss = reference_losses is not None and len(reference_losses) > 0 + + retStr = "// testoutputs.h — Phase 2: loss verification\n" + retStr += f"#define TRAINING_TOLERANCE_ABS {tolerance_abs:.10g}f\n\n" + + if has_loss: + n = len(reference_losses) + retStr += "// Expected loss for each forward pass (one per mini-batch)\n" + retStr += f"#define N_LOSS_REFS {n}\n" + vals = ", ".join(f"{float(v):.10g}f" for v in reference_losses) + retStr += f"float32_t testLossRef[{n}] = {{{vals}}};\n\n" + else: + retStr += "// No loss reference available — loss comparison skipped.\n" + retStr += "#define N_LOSS_REFS 0\n\n" + + return retStr + + +def generateTrainingNetworkHeader(deployer: NetworkDeployer) -> str: + """Generate TrainingNetwork.h — same as generateTestNetworkHeader but with + RunTrainingNetwork / InitTrainingNetwork function names and a distinct header guard. + + Parameters + ---------- + deployer : NetworkDeployer + Prepared deployer. + + Returns + ------- + str + C header string. + """ + retStr = "" + + retStr += """ +#ifndef __DEEPLOY_TRAINING_HEADER__ +#define __DEEPLOY_TRAINING_HEADER__ +#include +#include +#include +""" + retStr += deployer.generateIncludeString() + if isinstance(deployer.Platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): + retStr += """ +void RunTrainingNetwork(); +void InitTrainingNetwork(); + +""" + else: + retStr += """ +void RunTrainingNetwork(uint32_t core_id, uint32_t numThreads); +void InitTrainingNetwork(uint32_t core_id, uint32_t numThread); + +""" + + retStr += deployer.generateIOBufferInitializationCode() + retStr += """ +#endif +""" + + return retStr + + +def generateTrainingNetworkImplementation(deployer: NetworkDeployer, verbosityCfg: CodeGenVerbosity) -> str: + """Generate TrainingNetwork.c — same as generateTestNetworkImplementation but with + RunTrainingNetwork / InitTrainingNetwork function names and including TrainingNetwork.h. 
    Parameters
    ----------
    deployer : NetworkDeployer
        Prepared deployer.
    verbosityCfg : CodeGenVerbosity
        Verbosity configuration.

    Returns
    -------
    str
        C implementation string.
    """
    retStr = ""

    # NOTE(review): the include targets (<...>) appear stripped in this view —
    # verify the intended headers against the repository.
    retStr += """#include 
#include 
#include 
"""
    retStr += deployer.generateIncludeString()
    retStr += """

#include "TrainingNetwork.h"

"""

    retStr += deployer.generateBufferInitializationCode()
    retStr += deployer.generateGlobalDefinitionCode()

    # Platform-specific entry-point shape: MemPool takes (core_id, numThreads)
    # with init code emitted before the function; PULP takes no arguments.
    if isinstance(deployer.Platform, MemPoolPlatform):
        retStr += deployer.generateInferenceInitializationCode()
        retStr += """
void RunTrainingNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){
"""
    elif isinstance(deployer.Platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)):
        retStr += """
void RunTrainingNetwork(){
"""
        retStr += deployer.generateInferenceInitializationCode()
    else:
        retStr += """
void RunTrainingNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){
"""
        retStr += deployer.generateInferenceInitializationCode()

    retStr += deployer.generateFunction(verbosityCfg)
    if isinstance(deployer.Platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)):
        retStr += """
}

void InitTrainingNetwork(){
"""
    else:
        retStr += """
}

void InitTrainingNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){
"""
    retStr += deployer.generateEngineInitializationCode()
    retStr += deployer.generateBufferAllocationCode()
    retStr += """
}
"""

    return retStr


def generateTrainingTestNetwork(deployer: NetworkDeployer, all_mb_data: List[List[np.ndarray]], dumpdir: str,
                                verbosityCfg: CodeGenVerbosity, n_steps: int = 1, n_accum: int = 1,
                                num_data_inputs: int = 2, grad_buf_start_idx: int = 0, num_grad_inputs: int = 0,
                                learning_rate: float = 0.001, reference_losses: Optional[List] = None,
                                init_weights: Optional[List] = None, data_size: Optional[int] = None,
                                tolerance_abs: float = 1e-3) -> None:
    """Generate all training test files: testinputs.h, testoutputs.h, TrainingNetwork.h, TrainingNetwork.c.

    Parameters
    ----------
    deployer : NetworkDeployer
        Prepared deployer (ctxt.name must already be set to "DeeployTrainingNetwork").
    all_mb_data : list of list of np.ndarray
        Per-mini-batch DATA arrays: ``all_mb_data[mb][buf]`` is the array for
        mini-batch *mb* and DATA buffer *buf*.
    dumpdir : str
        Output directory for generated files.
    verbosityCfg : CodeGenVerbosity
        Verbosity configuration.
    n_steps : int
        N_TRAIN_STEPS value.
    n_accum : int
        N_ACCUM_STEPS value.
    num_data_inputs : int
        Number of data inputs (TRAINING_NUM_DATA_INPUTS).
    grad_buf_start_idx : int
        Index of the first grad accumulation buffer in DeeployNetwork_inputs[].
    num_grad_inputs : int
        Number of grad accumulation buffers (TRAINING_NUM_GRAD_INPUTS).
    learning_rate : float
        SGD learning rate forwarded to testinputs.h (TRAINING_LEARNING_RATE).
    reference_losses : list of float, optional
        Per-forward-pass reference losses for testoutputs.h; None skips comparison.
    init_weights : list of np.ndarray, optional
        Initial weight arrays, one per weight input slot.
    data_size : int, optional
        Number of unique data rows emitted; the C harness cycles modulo this.
    tolerance_abs : float
        Absolute loss tolerance forwarded to testoutputs.h.
    """
    assert deployer.prepared, "An unprepared deployer was given"

    os.makedirs(dumpdir, exist_ok=True)

    # testinputs.h
    testInputStr = generateTrainingTestInputsHeader(deployer, all_mb_data, n_steps, n_accum, grad_buf_start_idx,
                                                    num_grad_inputs, learning_rate, init_weights=init_weights,
                                                    data_size=data_size)
    with open(f'{dumpdir}/testinputs.h', 'w') as f:
        f.write(testInputStr)

    # testoutputs.h
    testOutputStr = generateTrainingTestOutputsHeader(
        reference_losses=reference_losses,
        tolerance_abs=tolerance_abs,
    )
    with open(f'{dumpdir}/testoutputs.h', 'w') as f:
        f.write(testOutputStr)

    # TrainingNetwork.h
    headerStr = generateTrainingNetworkHeader(deployer)
    with open(f'{dumpdir}/TrainingNetwork.h', 'w') as f:
        f.write(headerStr)

    # TrainingNetwork.c
    implStr = generateTrainingNetworkImplementation(deployer, verbosityCfg)
    with open(f'{dumpdir}/TrainingNetwork.c', 'w') as f:
        f.write(implStr)

    clang_format = "{BasedOnStyle: llvm, IndentWidth: 2, ColumnLimit: 160}"
    for fname in ['TrainingNetwork.c', 'TrainingNetwork.h', 'testinputs.h', 'testoutputs.h']:
        os.system(f'clang-format -i --style="{clang_format}" {dumpdir}/{fname}')

    # Build initial-value list for every input_N buffer so that L3 hex files
    # can be written. The list must cover all N where "input_N" exists in the
    # deployer context. Layout (must match DeeployNetwork_inputs[] order):
    # [0 .. num_data_inputs-1] → first mini-batch data
    # [num_data_inputs .. grad_start-1] → initial weights
    # [grad_start .. grad_start+num_grad-1] → zeros (grad acc bufs)
    # [last] → lazy_reset_grad = 1 (uint8)
    l3_initial_inputs: List[np.ndarray] = []
    # Count how many input_N buffers exist in the deployer context
    n_total_inputs = sum(1 for name in deployer.ctxt.globalObjects
                         if name.startswith("input_") and name[len("input_"):].isdigit())
    for i in range(n_total_inputs):
        if all_mb_data and i < len(all_mb_data[0]):
            # Data / label input
            l3_initial_inputs.append(all_mb_data[0][i])
        elif (init_weights is not None and grad_buf_start_idx > 0
              and num_data_inputs <= i < grad_buf_start_idx):
            # Weight input
            wi = i - num_data_inputs
            l3_initial_inputs.append(init_weights[wi] if wi < len(init_weights) else np.array([0.0], dtype=np.float32))
        elif (grad_buf_start_idx > 0 and num_grad_inputs > 0
              and grad_buf_start_idx <= i < grad_buf_start_idx + num_grad_inputs):
            # Gradient accumulation buffer — zero-initialised
            buf = deployer.ctxt.globalObjects.get(f"input_{i}")
            shape = buf.shape if (buf is not None and hasattr(buf, 'shape')) else (1,)
            l3_initial_inputs.append(np.zeros(shape, dtype=np.float32))
        else:
            # lazy_reset_grad (last input) or any unknown slot — default 1 / uint8
            buf = deployer.ctxt.globalObjects.get(f"input_{i}")
            shape = buf.shape if (buf is not None and hasattr(buf, 'shape')) else (1,)
            l3_initial_inputs.append(np.ones(shape, dtype=np.uint8))

    generateL3HexDump(deployer, os.path.join(dumpdir, 'hex'), l3_initial_inputs, [])
# ---------------------------------------------------------------------------
# Optimizer network code-generation helpers
# ---------------------------------------------------------------------------

_OPT_PREFIX = "DeeployOptNetwork_"
_TRAIN_PREFIX = "DeeployNetwork_"


def build_shared_buffer_maps(train_onnx_path: str, opt_onnx_model) -> Tuple[Dict[int, int], Dict[int, int]]:
    """Match optimizer-graph I/O tensors to training-graph inputs by name.

    The optimizer ONNX takes interleaved weight/grad tensors whose names also
    appear among the training graph's inputs; by resolving those names here,
    ``InitOptimizerNetwork`` can alias the already-allocated
    ``DeeployNetwork_input_N`` pointers instead of allocating new buffers.

    Parameters
    ----------
    train_onnx_path : str
        Path to the training ``network.onnx``.
    opt_onnx_model :
        Already-loaded optimizer ONNX model (``onnx.ModelProto``).

    Returns
    -------
    shared_input_map : Dict[int, int]
        opt_input_idx → train_input_idx.
    shared_output_map : Dict[int, int]
        opt_output_idx → train_input_idx; SGD outputs are the updated weights
        and reuse the weight input's physical buffer. Output names are matched
        exactly first, then with a trailing '_updated' suffix stripped.
    """
    import onnx as _onnx
    training_inputs = [inp.name for inp in _onnx.load_model(train_onnx_path).graph.input]
    index_of = {tensor_name: pos for pos, tensor_name in enumerate(training_inputs)}

    shared_input_map: Dict[int, int] = {
        opt_idx: index_of[inp.name]
        for opt_idx, inp in enumerate(opt_onnx_model.graph.input)
        if inp.name in index_of
    }

    shared_output_map: Dict[int, int] = {}
    for opt_idx, out in enumerate(opt_onnx_model.graph.output):
        # Exact name first; otherwise drop the '_updated' suffix the SGD node
        # appends (e.g. 'conv1_weight_updated' → 'conv1_weight').
        candidate = out.name
        if candidate not in index_of and candidate.endswith('_updated'):
            candidate = candidate[:-len('_updated')]
        if candidate in index_of:
            shared_output_map[opt_idx] = index_of[candidate]

    return shared_input_map, shared_output_map


def _patch_shared_buffers(retStr: str, shared_input_map: Dict[int, int], shared_output_map: Dict[int, int]) -> str:
    """Rewrite optimizer I/O allocations to alias TrainingNetwork buffers.

    Must run AFTER the _TRAIN_PREFIX → _OPT_PREFIX substitution, i.e. on
    source that already uses ``DeeployOptNetwork_`` symbols. Two allocation
    styles are rewritten to a direct pointer into the training buffers:
    per-buffer ``pi_*_malloc`` assignments (non-tiled) and arena-offset
    assignments of the form
    ``(Type *)((char *)DeeployOptNetwork_MEMORYARENA_Lx + OFFSET)`` (tiled).
    When an L2/L3 arena ends up with no remaining pointer arithmetic into it,
    its malloc line is dropped to reclaim the memory (the global declaration
    stays; the pointer is simply NULL at runtime). Finally the
    TrainingNetwork header is injected so the aliased symbols resolve.

    Parameters
    ----------
    retStr : str
        The already-prefix-substituted C source string.
    shared_input_map : Dict[int, int]
        Optimizer input index → training input index.
    shared_output_map : Dict[int, int]
        Optimizer output index → training input index (in-place update).

    Returns
    -------
    str
        Patched C source string.
    """
    if not shared_input_map and not shared_output_map:
        return retStr

    # Non-tiled style: one pi_*_malloc per I/O buffer.
    malloc_assign = re.compile(
        r'(DeeployOptNetwork_(input|output)_(\d+))\s*=\s*\([^)]+\s*\*\s*\)\s*pi_\w+_malloc\([^;]+\);'
    )
    # Tiled style: pointer carved out of a memory arena at a fixed offset.
    arena_assign = re.compile(
        r'(DeeployOptNetwork_(input|output)_(\d+))\s*=\s*\([^)]+\s*\*\s*\)'
        r'\s*\(\s*\(char\s*\*\)\s*DeeployOptNetwork_MEMORYARENA_L\w+\s*\+\s*\d+\s*\)\s*;'
    )

    def _redirect(match: "re.Match") -> str:
        symbol, kind, idx = match.group(1), match.group(2), int(match.group(3))
        if kind == "input" and idx in shared_input_map:
            return f'{symbol} = (float32_t *){_TRAIN_PREFIX}input_{shared_input_map[idx]}; /* shared with TrainingNetwork */'
        if kind == "output" and idx in shared_output_map:
            return f'{symbol} = (float32_t *){_TRAIN_PREFIX}input_{shared_output_map[idx]}; /* in-place, shared with TrainingNetwork */'
        return match.group(0)

    for pattern in (malloc_assign, arena_assign):
        retStr = pattern.sub(_redirect, retStr)

    # Drop an arena malloc once nothing points into that arena any more.
    for level in ('L2', 'L3'):
        arena_sym = f'DeeployOptNetwork_MEMORYARENA_{level}'
        arena_used = re.search(
            rf'\(\s*(?:char|void|int8_t)\s*\*\s*\)\s*{re.escape(arena_sym)}', retStr)
        if arena_used is None:
            retStr = re.sub(
                rf'[^\n]*{re.escape(arena_sym)}\s*=\s*\([^)]+\)\s*pi_\w+_malloc\([^;]+\);\s*\n',
                '', retStr)

    # Make the DeeployNetwork_* symbols visible to the optimizer unit.
    retStr = retStr.replace(
        '#include "OptimizerNetwork.h"',
        '#include "OptimizerNetwork.h"\n#include "TrainingNetwork.h"',
    )
    return retStr


def _patch_shared_arenas(retStr: str, train_c_source: str) -> str:
    """Alias the optimizer's L1 tile arena onto TrainingNetwork's arena.

    RunTrainingNetwork() and RunOptimizerNetwork() execute strictly one after
    the other, so their L1 tile-compute scratch arenas never overlap in time
    and can share the same physical memory. Only L1 is shared: the L2 arena
    may hold persistent tensor data (weights, activations) at fixed offsets
    in non-tiled mode, so sharing it could corrupt that data.

    Must run AFTER the _TRAIN_PREFIX → _OPT_PREFIX substitution.

    Parameters
    ----------
    retStr : str
        The already-prefix-substituted C source string for the optimizer.
    train_c_source : str
        Full text of TrainingNetwork.c (checked for the arena symbol).

    Returns
    -------
    str
        Patched C source string.
    """
    for level in ('L1',):
        train_sym = f'DeeployNetwork_MEMORYARENA_{level}'
        if train_sym not in train_c_source:
            # Training side has no such arena — nothing to alias.
            continue

        opt_sym = f'DeeployOptNetwork_MEMORYARENA_{level}'
        opt_malloc_pat = re.compile(
            rf'({re.escape(opt_sym)})\s*=\s*\([^)]+\)\s*\w+\(sizeof\([^)]+\)\s*\*\s*\d+\)\s*;'
        )
        if opt_malloc_pat.search(retStr) is None:
            continue

        retStr = opt_malloc_pat.sub(
            f'{opt_sym} = (int8_t *){train_sym}; /* shared with TrainingNetwork */', retStr)

        # _patch_shared_buffers may have injected the header already; avoid duplicates.
        if '#include "TrainingNetwork.h"' not in retStr:
            retStr = retStr.replace(
                '#include "OptimizerNetwork.h"',
                '#include "OptimizerNetwork.h"\n#include "TrainingNetwork.h"',
            )

    return retStr
+ """ + m_opt = re.search( + r'DeeployOptNetwork_MEMORYARENA_L1\s*=\s*\([^)]+\)\s*pmsis_l1_malloc\(sizeof\([^)]+\)\s*\*\s*(\d+)\)', + opt_alloc_code, + ) + if not m_opt: + return train_c_source + + opt_l1 = int(m_opt.group(1)) + + m_train = re.search( + r'(DeeployNetwork_MEMORYARENA_L1\s*=\s*\([^)]+\)\s*pmsis_l1_malloc\(sizeof\([^)]+\)\s*\*\s*)(\d+)(\))', + train_c_source, + ) + if not m_train: + return train_c_source + + train_l1 = int(m_train.group(2)) + if opt_l1 <= train_l1: + return train_c_source # Already large enough + + new_l1 = opt_l1 + + # Patch TrainingNetwork.c malloc size + train_c_new = train_c_source.replace( + m_train.group(0), + f'{m_train.group(1)}{new_l1}{m_train.group(3)}', + 1, + ) + train_c_path = os.path.join(dumpdir, 'TrainingNetwork.c') + with open(train_c_path, 'w') as f: + f.write(train_c_new) + + # Patch TrainingNetwork.h _len constant + train_h_path = os.path.join(dumpdir, 'TrainingNetwork.h') + if os.path.exists(train_h_path): + train_h = open(train_h_path).read() + train_h_new = re.sub( + r'(DeeployNetwork_MEMORYARENA_L1_len\s*=\s*)\d+', + rf'\g<1>{new_l1}', + train_h, + ) + with open(train_h_path, 'w') as f: + f.write(train_h_new) + + return train_c_new + + +def generateOptimizerNetworkHeader(deployer: NetworkDeployer) -> str: + """Generate OptimizerNetwork.h. + + Reuses the Deeploy deployer's output and applies two transformations: + 1. Replace the buffer prefix ``DeeployNetwork_`` → ``DeeployOptNetwork_`` + 2. Inject ``RunOptimizerNetwork`` / ``InitOptimizerNetwork`` function declarations. + + Parameters + ---------- + deployer : NetworkDeployer + Prepared deployer for the optimizer ONNX graph. + + Returns + ------- + str + C header string. 
+ """ + retStr = "" + retStr += """ +#ifndef __DEEPLOY_OPTIMIZER_HEADER__ +#define __DEEPLOY_OPTIMIZER_HEADER__ +#include +#include +#include +""" + retStr += deployer.generateIncludeString() + if isinstance(deployer.Platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): + retStr += """ +void RunOptimizerNetwork(); +void InitOptimizerNetwork(); + +""" + else: + retStr += """ +void RunOptimizerNetwork(uint32_t core_id, uint32_t numThreads); +void InitOptimizerNetwork(uint32_t core_id, uint32_t numThreads); + +""" + retStr += deployer.generateIOBufferInitializationCode() + retStr += """ +#endif +""" + # Prefix substitution: all Deeploy-generated DeeployNetwork_ → DeeployOptNetwork_ + retStr = retStr.replace(_TRAIN_PREFIX, _OPT_PREFIX) + return retStr + + +def generateOptimizerNetworkImplementation(deployer: NetworkDeployer, + verbosityCfg: CodeGenVerbosity, + shared_input_map: Optional[Dict[int, int]] = None, + shared_output_map: Optional[Dict[int, int]] = None, + train_c_source: Optional[str] = None) -> str: + """Generate OptimizerNetwork.c. + + Parameters + ---------- + deployer : NetworkDeployer + Prepared deployer for the optimizer ONNX graph. + verbosityCfg : CodeGenVerbosity + Verbosity configuration. + shared_input_map : Dict[int, int], optional + Optimizer input index → training input index for shared weight/grad buffers. + When provided, those malloc calls are replaced with references to the + already-allocated TrainingNetwork buffers. + shared_output_map : Dict[int, int], optional + Optimizer output index → training input index for in-place shared outputs. + train_c_source : str, optional + Full text of TrainingNetwork.c. When provided, the optimizer's L1/L2 arena + malloc calls are replaced with direct pointers to the training arenas, + saving one L1 and one L2 allocation (safe because the two networks run + strictly sequentially). + + Returns + ------- + str + C implementation string. 
+ """ + retStr = "" + retStr += """#include +#include +#include +""" + retStr += deployer.generateIncludeString() + retStr += """ +#include "OptimizerNetwork.h" + +""" + retStr += deployer.generateBufferInitializationCode() + retStr += deployer.generateGlobalDefinitionCode() + + if isinstance(deployer.Platform, MemPoolPlatform): + retStr += deployer.generateInferenceInitializationCode() + retStr += """ +void RunOptimizerNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ +""" + elif isinstance(deployer.Platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): + retStr += """ +void RunOptimizerNetwork(){ +""" + retStr += deployer.generateInferenceInitializationCode() + else: + retStr += """ +void RunOptimizerNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ +""" + retStr += deployer.generateInferenceInitializationCode() + + retStr += deployer.generateFunction(verbosityCfg) + + if isinstance(deployer.Platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): + retStr += """ +} + +void InitOptimizerNetwork(){ +""" + else: + retStr += """ +} + +void InitOptimizerNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ +""" + retStr += deployer.generateEngineInitializationCode() + retStr += deployer.generateBufferAllocationCode() + retStr += """ +} +""" + # Prefix substitution + retStr = retStr.replace(_TRAIN_PREFIX, _OPT_PREFIX) + # Replace malloc calls for shared weight/grad buffers with Training pointers + retStr = _patch_shared_buffers(retStr, shared_input_map or {}, shared_output_map or {}) + # Redirect optimizer L1/L2 arena mallocs to reuse training arenas + if train_c_source: + retStr = _patch_shared_arenas(retStr, train_c_source) + return retStr + + +def generateOptimizerTestNetwork(deployer: NetworkDeployer, + dumpdir: str, + verbosityCfg: CodeGenVerbosity, + shared_input_map: Optional[Dict[int, int]] = 
None, + shared_output_map: Optional[Dict[int, int]] = None) -> None: + """Generate OptimizerNetwork.h and OptimizerNetwork.c. + + Parameters + ---------- + deployer : NetworkDeployer + Prepared deployer for the optimizer ONNX graph. + dumpdir : str + Output directory for generated files. + verbosityCfg : CodeGenVerbosity + Verbosity configuration. + shared_input_map : Dict[int, int], optional + Optimizer input index → training input index for shared weight/grad buffers. + shared_output_map : Dict[int, int], optional + Optimizer output index → training input index for in-place shared outputs. + """ + assert deployer.prepared, "An unprepared deployer was given" + + os.makedirs(dumpdir, exist_ok=True) + + train_c_path = os.path.join(dumpdir, 'TrainingNetwork.c') + train_c_source: Optional[str] = None + if os.path.exists(train_c_path): + with open(train_c_path, 'r') as f: + train_c_source = f.read() + + # Enlarge training L1 arena if optimizer needs more (so unconditional L1 sharing is safe) + if train_c_source: + opt_alloc_preview = deployer.generateBufferAllocationCode().replace(_TRAIN_PREFIX, _OPT_PREFIX) + train_c_source = _ensure_training_l1_capacity(dumpdir, train_c_source, opt_alloc_preview) + + headerStr = generateOptimizerNetworkHeader(deployer) + with open(f'{dumpdir}/OptimizerNetwork.h', 'w') as f: + f.write(headerStr) + + implStr = generateOptimizerNetworkImplementation(deployer, verbosityCfg, shared_input_map, shared_output_map, + train_c_source) + with open(f'{dumpdir}/OptimizerNetwork.c', 'w') as f: + f.write(implStr) + + clang_format = "{BasedOnStyle: llvm, IndentWidth: 2, ColumnLimit: 160}" + for fname in ['OptimizerNetwork.c', 'OptimizerNetwork.h']: + os.system(f'clang-format -i --style="{clang_format}" {dumpdir}/{fname}') diff --git a/DeeployTest/testUtils/core/config.py b/DeeployTest/testUtils/core/config.py index e932c23962..0ecf45d467 100644 --- a/DeeployTest/testUtils/core/config.py +++ b/DeeployTest/testUtils/core/config.py @@ -24,6 +24,14 @@ 
class DeeployTestConfig: gen_args: List[str] = None verbose: int = 0 debug: bool = False + training: bool = False + # None means "auto-detect from ONNX graph / inputs.npz during codegen" + n_train_steps: Optional[int] = None + n_accum_steps: Optional[int] = None + training_num_data_inputs: Optional[int] = None + # Directory containing the optimizer ONNX (network.onnx with SGD nodes). + # If None, defaults to /../simplemlp_optimizer when training=True. + optimizer_dir: Optional[str] = None def __post_init__(self): if self.cmake_args is None: diff --git a/DeeployTest/testUtils/core/execution.py b/DeeployTest/testUtils/core/execution.py index 1dcddeea62..9aff13cede 100644 --- a/DeeployTest/testUtils/core/execution.py +++ b/DeeployTest/testUtils/core/execution.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +import json import os import shutil import subprocess @@ -14,10 +15,56 @@ from .output_parser import TestResult, parse_test_output +def _augment_path(env: dict) -> dict: + """Prepend gvsoc/llvm bin dirs to PATH based on installed env vars. + + The install dirs are already set as env vars (GVSOC_INSTALL_DIR, + LLVM_INSTALL_DIR) but their bin/ subdirectories may not be in PATH. + + If a virtual environment is active (VIRTUAL_ENV is set), its bin dir + is prepended so that shebang-invoked scripts (kconfigtool.py, gapy) + resolve python3 to the venv interpreter, which has kconfiglib. + Without this, /usr/bin/python3 would be picked up instead, which + lacks kconfiglib and causes CMake kconfig setup to fail. 
+ """ + venv = env.get('VIRTUAL_ENV', '') + extra = [str(Path(venv) / 'bin')] if venv else ['/usr/bin'] + for var in ('GVSOC_INSTALL_DIR', 'LLVM_INSTALL_DIR'): + install_dir = env.get(var, '') + if install_dir: + bin_dir = str(Path(install_dir) / 'bin') + current = env.get('PATH', '').split(':') + if bin_dir not in current: + extra.append(bin_dir) + env['PATH'] = ':'.join(extra) + ':' + env.get('PATH', '') + return env + + +def _resolve_optimizer_dir(config: DeeployTestConfig) -> str: + """Return the optimizer ONNX directory for this config. + + Falls back to /../_optimizer if not explicitly set, + where is derived by replacing the '_train' suffix of the test + directory name with '_optimizer' (e.g. simplemlp_train → simplemlp_optimizer, + sleepconvit_train → sleepconvit_optimizer). + """ + if config.optimizer_dir: + return config.optimizer_dir + test_parent = Path(config.test_dir).parent + test_dir_name = Path(config.test_dir).name + optimizer_name = test_dir_name.replace("_train", "_optimizer") + return str(test_parent / optimizer_name) + + def generate_network(config: DeeployTestConfig, skip: bool = False) -> None: """ Generate network code from ONNX model. + In training mode, generates both TrainingNetwork (fwd+bwd) and + OptimizerNetwork (SGD) into the same gen_dir. Auto-detected training + parameters (n_steps, n_accum, num_data_inputs) are written to + gen_dir/training_meta.json and read back into config after codegen. 
+ Raises: RuntimeError: If network generation fails """ @@ -27,31 +74,175 @@ def generate_network(config: DeeployTestConfig, skip: bool = False) -> None: script_dir = Path(__file__).parent.parent.parent - if config.tiling: + if config.training and config.tiling: + # --- Tiled training: testMVPTraining.py (tiling pipeline + training init) --- + generation_script = script_dir / "testMVPTraining.py" + cmd = [ + sys.executable, + str(generation_script), + "-d", config.gen_dir, + "-t", config.test_dir, + "-p", config.platform, + ] + if config.n_train_steps is not None: + cmd.append(f"--n-steps={config.n_train_steps}") + if config.n_accum_steps is not None: + cmd.append(f"--n-accum={config.n_accum_steps}") + if config.training_num_data_inputs is not None: + cmd.append(f"--num-data-inputs={config.training_num_data_inputs}") + if config.verbose > 0: + cmd.append("-" + "v" * config.verbose) + if config.debug: + cmd.append("--debug") + cmd.extend(config.gen_args) + + log.debug(f"[Execution] Tiled training generation command: {' '.join(cmd)}") + result = subprocess.run(cmd, check=False) + if result.returncode != 0: + raise RuntimeError(f"Tiled training network generation failed for {config.test_name}") + + # Read back auto-detected values written by testMVPTraining.py + meta_path = Path(config.gen_dir) / "training_meta.json" + if meta_path.exists(): + with open(meta_path) as f: + meta = json.load(f) + config.n_train_steps = meta["n_train_steps"] + config.n_accum_steps = meta["n_accum_steps"] + config.training_num_data_inputs = meta["training_num_data_inputs"] + log.info(f"[Execution] Training meta: {meta}") + + # --- Step 2: Tiled optimizer network (SGD via testMVPOptimizer.py) --- + opt_dir = _resolve_optimizer_dir(config) + opt_script = script_dir / "testMVPOptimizer.py" + + if not Path(opt_dir).exists(): + log.warning(f"Optimizer directory not found: {opt_dir} — skipping optimizer codegen") + elif not opt_script.exists(): + log.warning(f"testMVPOptimizer.py not found — 
skipping optimizer codegen") + else: + opt_cmd = [ + sys.executable, + str(opt_script), + "-d", config.gen_dir, + "-t", opt_dir, + "-p", config.platform, + f"--training-dir={config.test_dir}", + ] + _OPT_PASSTHROUGH = ("--cores", "--l1", "--l2", + "--defaultMemLevel", + "--memAllocStrategy", "--searchStrategy", + "--plotMemAlloc", "--profileTiling") + for arg in config.gen_args: + if any(arg.startswith(p) for p in _OPT_PASSTHROUGH): + opt_cmd.append(arg) + # If no --defaultMemLevel was passed through, default to L2 + if not any(arg.startswith("--defaultMemLevel") for arg in opt_cmd): + opt_cmd.append("--defaultMemLevel=L2") + if config.verbose > 0: + opt_cmd.append("-" + "v" * config.verbose) + + log.debug(f"[Execution] Tiled optimizer generation command: {' '.join(opt_cmd)}") + result = subprocess.run(opt_cmd, check=False) + if result.returncode != 0: + raise RuntimeError(f"Tiled optimizer network generation failed for {config.test_name}") + + return # early return — tiled training path complete + + elif config.training: + # --- Step 1: Training network (forward + backward + accumulation) --- + generation_script = script_dir / "generateTrainingNetwork.py" + cmd = [ + sys.executable, + str(generation_script), + "-d", config.gen_dir, + "-t", config.test_dir, + "-p", config.platform, + ] + # Only pass values when explicitly set; otherwise let the script auto-detect + if config.n_train_steps is not None: + cmd.append(f"--n-steps={config.n_train_steps}") + if config.n_accum_steps is not None: + cmd.append(f"--n-accum={config.n_accum_steps}") + if config.training_num_data_inputs is not None: + cmd.append(f"--num-data-inputs={config.training_num_data_inputs}") + + if config.verbose > 0: + cmd.append("-" + "v" * config.verbose) + if config.debug: + cmd.append("--debug") + cmd.extend(config.gen_args) + + log.debug(f"[Execution] Training generation command: {' '.join(cmd)}") + result = subprocess.run(cmd, check=False) + if result.returncode != 0: + raise 
RuntimeError(f"Training network generation failed for {config.test_name}") + + # Read back auto-detected values written by generateTrainingNetwork.py + meta_path = Path(config.gen_dir) / "training_meta.json" + if meta_path.exists(): + with open(meta_path) as f: + meta = json.load(f) + config.n_train_steps = meta["n_train_steps"] + config.n_accum_steps = meta["n_accum_steps"] + config.training_num_data_inputs = meta["training_num_data_inputs"] + log.info(f"[Execution] Training meta: {meta}") + + # --- Step 2: Optimizer network (SGD) --- + opt_dir = _resolve_optimizer_dir(config) + opt_script = script_dir / "generateOptimizerNetwork.py" + + if not Path(opt_dir).exists(): + log.warning(f"Optimizer directory not found: {opt_dir} — skipping optimizer codegen") + elif not opt_script.exists(): + log.warning(f"generateOptimizerNetwork.py not found — skipping optimizer codegen") + else: + opt_cmd = [ + sys.executable, + str(opt_script), + "-d", config.gen_dir, + "-t", opt_dir, + "-p", config.platform, + f"--training-dir={config.test_dir}", + ] + _OPT_PASSTHROUGH = ("--cores", "--l1", "--l2", "--defaultMemLevel") + for arg in config.gen_args: + if any(arg.startswith(p) for p in _OPT_PASSTHROUGH): + opt_cmd.append(arg) + if not any(arg.startswith("--defaultMemLevel") for arg in opt_cmd): + opt_cmd.append("--defaultMemLevel=L2") + if config.verbose > 0: + opt_cmd.append("-" + "v" * config.verbose) + + log.debug(f"[Execution] Optimizer generation command: {' '.join(opt_cmd)}") + result = subprocess.run(opt_cmd, check=False) + if result.returncode != 0: + raise RuntimeError(f"Optimizer network generation failed for {config.test_name}") + + return # early return — training path complete + + elif config.tiling: generation_script = script_dir / "testMVP.py" + cmd = [ + sys.executable, + str(generation_script), + "-d", config.gen_dir, + "-t", config.test_dir, + "-p", config.platform, + ] else: generation_script = script_dir / "generateNetwork.py" + cmd = [ + sys.executable, + 
str(generation_script), + "-d", config.gen_dir, + "-t", config.test_dir, + "-p", config.platform, + ] - cmd = [ - "python", - str(generation_script), - "-d", - config.gen_dir, - "-t", - config.test_dir, - "-p", - config.platform, - ] - - # Add verbosity flags if config.verbose > 0: cmd.append("-" + "v" * config.verbose) - - # Add debug flag if config.debug: cmd.append("--debug") - - # Add additional generation arguments cmd.extend(config.gen_args) log.debug(f"[Execution] Generation command: {' '.join(cmd)}") @@ -72,7 +263,6 @@ def configure_cmake(config: DeeployTestConfig) -> None: if cmake_cmd == "cmake" and shutil.which("cmake") is None: raise RuntimeError("CMake not found. Please install CMake or set CMAKE environment variable") - # Build CMake command cmd = [ cmake_cmd, f"-DTOOLCHAIN={config.toolchain}", @@ -102,11 +292,22 @@ def configure_cmake(config: DeeployTestConfig) -> None: else: cmd.append("-Dgvsoc_simulation=OFF") - # Last argument is the source directory + if config.training: + cmd.append("-DTRAINING=ON") + # Only add cmake defines when the values are known (after codegen) + if config.n_train_steps is not None: + cmd.append(f"-DN_TRAIN_STEPS={config.n_train_steps}") + if config.n_accum_steps is not None: + cmd.append(f"-DN_ACCUM_STEPS={config.n_accum_steps}") + if config.training_num_data_inputs is not None: + cmd.append(f"-DTRAINING_NUM_DATA_INPUTS={config.training_num_data_inputs}") + else: + cmd.append("-DTRAINING=OFF") + script_dir = Path(__file__).parent.parent.parent cmd.append(str(script_dir.parent)) - env = os.environ.copy() + env = _augment_path(os.environ.copy()) if config.verbose >= 3: env["VERBOSE"] = "1" @@ -162,44 +363,49 @@ def run_simulation(config: DeeployTestConfig, skip: bool = False) -> TestResult: if config.simulator == 'none': raise RuntimeError("No simulator specified!") + env = _augment_path(os.environ.copy()) + if config.verbose >= 3: + env["VERBOSE"] = "1" + if config.simulator == 'host': - # Run binary directly binary_path = 
Path(config.build_dir) / "bin" / config.test_name cmd = [str(binary_path)] - else: - # Run via CMake target - cmake_cmd = os.environ.get("CMAKE", "cmake") - cmd = [ - cmake_cmd, - "--build", - config.build_dir, - "--target", - f"{config.simulator}_{config.test_name}", - ] - env = os.environ.copy() - if config.verbose >= 3: - env["VERBOSE"] = "1" + elif config.simulator == 'gvsoc': + cmake_cmd = os.environ.get("CMAKE", "cmake") + cmd = [cmake_cmd, "--build", config.build_dir, "--target", + f"gvsoc_{config.test_name}"] - if config.simulator == 'banshee': + elif config.simulator == 'banshee': if config.verbose == 1: env["BANSHEE_LOG"] = "warn" elif config.verbose == 2: env["BANSHEE_LOG"] = "info" elif config.verbose >= 3: env["BANSHEE_LOG"] = "debug" + cmake_cmd = os.environ.get("CMAKE", "cmake") + cmd = [cmake_cmd, "--build", config.build_dir, "--target", + f"{config.simulator}_{config.test_name}"] - log.debug(f"[Execution] Simulation command: {' '.join(cmd)}") + else: + cmake_cmd = os.environ.get("CMAKE", "cmake") + cmd = [cmake_cmd, "--build", config.build_dir, "--target", + f"{config.simulator}_{config.test_name}"] - result = subprocess.run(cmd, capture_output = True, text = True, env = env) + log.debug(f"[Execution] Simulation command: {' '.join(cmd)}") - if result.stdout: - print(result.stdout, end = '') - if result.stderr: - print(result.stderr, end = '', file = sys.stderr) + # Stream output in real-time (line-buffered) and capture for parsing. 
+ proc = subprocess.Popen(cmd, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, + text = True, env = env, bufsize = 1) + stdout_lines = [] + for line in proc.stdout: + print(line, end = '', flush = True) + stdout_lines.append(line) + proc.stdout.close() + proc.wait() + stdout_output = ''.join(stdout_lines) - # Parse output for error count and cycles - test_result = parse_test_output(result.stdout, result.stderr) + test_result = parse_test_output(stdout_output, '') if not test_result.success and test_result.error_count == -1: log.warning(f"Could not parse error count from output") @@ -213,16 +419,9 @@ def run_complete_test(config: DeeployTestConfig, skipgen: bool = False, skipsim: """ log.info(f"################## Testing {config.test_name} on {config.platform} Platform ##################") - # Step 1: Generate network generate_network(config, skip = skipgen) - - # Step 2: Configure CMake configure_cmake(config) - - # Step 3: Build binary build_binary(config) - - # Step 4: Run simulation result = run_simulation(config, skip = skipsim) return result diff --git a/DeeployTest/testUtils/deeployTrainingRunner.py b/DeeployTest/testUtils/deeployTrainingRunner.py new file mode 100644 index 0000000000..9ee4a64cf4 --- /dev/null +++ b/DeeployTest/testUtils/deeployTrainingRunner.py @@ -0,0 +1,149 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +""" +Common entry point for Siracusa training test runners (non-tiled and tiled). + +Usage: + from testUtils.deeployTrainingRunner import main + sys.exit(main(tiling_enabled=False)) # non-tiled + sys.exit(main(tiling_enabled=True)) # tiled (SBTiler) +""" + +import os +import sys +from pathlib import Path +from typing import Optional + +# gapy (gvsoc launcher) uses `#!/usr/bin/env python3`. Put /usr/bin first so +# it resolves to /usr/bin/python3 which has all required packages (gapylib, +# prettytable, …) rather than the minimal venv python. 
+os.environ['PATH'] = '/usr/bin:' + os.environ.get('PATH', '') + +from .core import DeeployTestConfig, run_complete_test +from .core.paths import get_test_paths +from .deeployRunner import DeeployRunnerArgumentParser, print_colored_result, print_configuration + + +def main(tiling_enabled: bool = False, default_platform: str = 'Siracusa', default_simulator: str = 'gvsoc'): + """ + Build parser, parse args, create DeeployTestConfig, and run the training test. + + Parameters + ---------- + tiling_enabled: + True → passes tiling args (--l1, --l2, …) and sets tiling=True in config. + default_platform: + Platform used when -p is not given on the command line. + default_simulator: + Simulator used when -s is not given on the command line. + """ + + parser = DeeployRunnerArgumentParser(tiling_arguments = tiling_enabled, platform_required = False) + + parser.add_argument('--cores', type = int, default = 8, help = 'Number of cluster cores (default: 8)\n') + parser.add_argument('--n-steps', + metavar = '', + dest = 'n_steps', + type = int, + default = None, + help = 'N_TRAIN_STEPS: optimizer steps (auto-detected if not given)\n') + parser.add_argument('--n-accum', + metavar = '', + dest = 'n_accum', + type = int, + default = None, + help = 'N_ACCUM_STEPS: mini-batches per update step (auto-detected if not given)\n') + parser.add_argument('--num-data-inputs', + metavar = '', + dest = 'num_data_inputs', + type = int, + default = None, + help = 'Inputs that change each mini-batch (auto-detected if not given)\n') + parser.add_argument('--optimizer-dir', + metavar = '', + dest = 'optimizer_dir', + type = str, + default = None, + help = 'Directory containing the optimizer network.onnx ' + "(default: auto-derived by replacing '_train' with '_optimizer')\n") + parser.add_argument('--tolerance', + metavar = '', + dest = 'tolerance', + type = float, + default = None, + help = 'Absolute loss tolerance for pass/fail comparison (default: auto from generateTrainingNetwork.py)\n') + + args 
= parser.parse_args() + + platform = default_platform + simulator = args.simulator if args.simulator else default_simulator + + script_path = Path(__file__).resolve() + base_dir = script_path.parent.parent + + gen_dir, test_dir_abs, test_name = get_test_paths(args.dir, platform, base_dir = str(base_dir)) + + worker_id = os.environ.get('PYTEST_XDIST_WORKER', 'master') + build_dir = str(base_dir / f'TEST_{platform.upper()}' / f'build_{worker_id}') + + cmake_args = [f'-DNUM_CORES={args.cores}'] + if args.cmake: + cmake_args.extend(args.cmake) + + gen_args = [f'--cores={args.cores}'] + if args.tolerance is not None: + gen_args.append(f'--tolerance={args.tolerance}') + if args.input_type_map: + gen_args.extend(['--input-type-map'] + list(args.input_type_map)) + if args.input_offset_map: + gen_args.extend(['--input-offset-map'] + list(args.input_offset_map)) + + if tiling_enabled: + if getattr(args, 'defaultMemLevel', None): + gen_args.append(f'--defaultMemLevel={args.defaultMemLevel}') + if getattr(args, 'l1', None): + gen_args.append(f'--l1={args.l1}') + if getattr(args, 'l2', None) and args.l2 != 1024000: + gen_args.append(f'--l2={args.l2}') + if getattr(args, 'memAllocStrategy', None): + gen_args.append(f'--memAllocStrategy={args.memAllocStrategy}') + if getattr(args, 'searchStrategy', None): + gen_args.append(f'--searchStrategy={args.searchStrategy}') + if getattr(args, 'profileTiling', False): + gen_args.append('--profileTiling') + if getattr(args, 'plotMemAlloc', False): + gen_args.append('--plotMemAlloc') + + config = DeeployTestConfig( + test_name = test_name, + test_dir = test_dir_abs, + platform = platform, + simulator = simulator, + tiling = tiling_enabled, + gen_dir = gen_dir, + build_dir = build_dir, + toolchain = args.toolchain, + toolchain_install_dir = args.toolchain_install_dir, + cmake_args = cmake_args, + gen_args = gen_args, + verbose = args.verbose, + debug = args.debug, + training = True, + n_train_steps = args.n_steps, + n_accum_steps = 
args.n_accum, + training_num_data_inputs = args.num_data_inputs, + optimizer_dir = args.optimizer_dir, + ) + + print_configuration(config) + + try: + result = run_complete_test(config, skipgen = args.skipgen, skipsim = args.skipsim) + print_colored_result(result, config.test_name) + return 0 if result.success else 1 + except Exception as e: + RED = '\033[91m' + RESET = '\033[0m' + print(f'\n{RED}✗ Test {config.test_name} FAILED with exception: {e}{RESET}') + return 1 diff --git a/DeeployTest/testUtils/tilingUtils.py b/DeeployTest/testUtils/tilingUtils.py index 0c3986cd6e..1e4b143cfb 100644 --- a/DeeployTest/testUtils/tilingUtils.py +++ b/DeeployTest/testUtils/tilingUtils.py @@ -2,11 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import List, Union +from typing import Dict, List, Optional, Tuple, Union from ortools.constraint_solver.pywrapcp import IntVar from Deeploy.DeeployTypes import NetworkContext, SubGraph, TransientBuffer +from Deeploy.TilingExtension.MemoryConstraints import PatternMemoryConstraints +from Deeploy.TilingExtension.MemoryScheduler import MemoryScheduler from Deeploy.TilingExtension.TilerExtension import Tiler from Deeploy.TilingExtension.TilerModel import TilerModel @@ -43,3 +45,28 @@ class SBTiler(Tiler): def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str], hop: str, tensorName: str) -> Union[int, IntVar]: return 1 + + +class TrainingMemoryScheduler(MemoryScheduler): + """MemoryScheduler variant for training networks. + + Extends input tensor lifetimes to the end of the full tiling schedule so + that forward-pass inputs remain live during the backward pass. 
+ """ + + def _calculateLifetimes( + self, ctxt: NetworkContext, patternMemoryConstraint: PatternMemoryConstraints, + memoryLevel: str) -> Tuple[Dict[str, Tuple[int, int]], Dict]: + tensorLifetimeMap, tensorMap = super()._calculateLifetimes(ctxt, patternMemoryConstraint, memoryLevel) + + maxStepIdx = len(patternMemoryConstraint.nodeConstraints) + for tensorName, lifetime in tensorLifetimeMap.items(): + buffer = ctxt.lookup(tensorName) + if buffer.is_input: + tensorLifetimeMap[tensorName] = (0, maxStepIdx) + + return tensorLifetimeMap, tensorMap + + +class TrainingSBTiler(SBTiler): + memorySchedulerClass = TrainingMemoryScheduler diff --git a/TargetLibraries/GAP9/CMakeLists.txt b/TargetLibraries/GAP9/CMakeLists.txt index ca4c3ffbeb..26d23f2f78 100644 --- a/TargetLibraries/GAP9/CMakeLists.txt +++ b/TargetLibraries/GAP9/CMakeLists.txt @@ -80,5 +80,28 @@ endif() target_link_libraries(deeploygap9 PUBLIC pulp-nn-mixed) +# Add pulp-trainlib integration for training/gradient operators +set(PULP_TRAINLIB_DIR ${CMAKE_CURRENT_LIST_DIR}/../PULPOpen/third_party/pulp-trainlib) + +target_include_directories(deeploygap9 PUBLIC + ${PULP_TRAINLIB_DIR}/lib/include +) + +file(GLOB PULP_TRAINLIB_CONV_SOURCES + "${PULP_TRAINLIB_DIR}/lib/sources/pulp_conv2d_fp32.c" + "${PULP_TRAINLIB_DIR}/lib/sources/pulp_im2col_fp32.c" + "${PULP_TRAINLIB_DIR}/lib/sources/pulp_conv_dw_fp32.c" + "${PULP_TRAINLIB_DIR}/lib/sources/pulp_conv_pw_fp32.c" + "${PULP_TRAINLIB_DIR}/lib/sources/pulp_matmul_fp32.c" + "${PULP_TRAINLIB_DIR}/lib/sources/pulp_train_utils_fp32.c" + "${PULP_TRAINLIB_DIR}/lib/sources/pulp_conv_naive_fp32.c" +) +target_sources(deeploygap9 PRIVATE ${PULP_TRAINLIB_CONV_SOURCES}) + +set_source_files_properties(${PULP_TRAINLIB_CONV_SOURCES} + PROPERTIES + COMPILE_FLAGS "-UUSE_DMA -Dfloat16alt=float -fno-strict-aliasing" +) + target_link_libraries(deeploygap9 PUBLIC m) diff --git a/TargetLibraries/GAP9/inc/DeeployGAP9Math.h b/TargetLibraries/GAP9/inc/DeeployGAP9Math.h index 
0efa74c72e..bd5976b683 100644 --- a/TargetLibraries/GAP9/inc/DeeployGAP9Math.h +++ b/TargetLibraries/GAP9/inc/DeeployGAP9Math.h @@ -24,4 +24,6 @@ #include "pmsis.h" +#include "DeeployPULPKernels.h" + #endif // __DEEPLOY_MATH_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/Layernorm.h b/TargetLibraries/Generic/inc/kernel/Layernorm.h index 381f184dd6..deb60d30ed 100644 --- a/TargetLibraries/Generic/inc/kernel/Layernorm.h +++ b/TargetLibraries/Generic/inc/kernel/Layernorm.h @@ -27,6 +27,11 @@ void Layernorm_fp32_fp32(float32_t *data_in, float32_t *data_out, void LayernormGrad_fp32_fp32(float32_t *grad_in, float32_t *data_in, float32_t *grad_out, float32_t *scale, - float32_t *bias, float32_t epsilon, int32_t size, + float32_t epsilon, int32_t size, int32_t lastDimLength); + +void LayernormGradParam_fp32_fp32(float32_t *grad_in, float32_t *data_in, + float32_t *weight_grad, float32_t *bias_grad, + float32_t epsilon, int32_t size, + int32_t lastDimLength); #endif //__DEEPLOY_BASIC_MATH_LAYERNORM_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/src/GELU_fp32.c b/TargetLibraries/Generic/src/GELU_fp32.c index 6cafed1986..dcbcae8176 100644 --- a/TargetLibraries/Generic/src/GELU_fp32.c +++ b/TargetLibraries/Generic/src/GELU_fp32.c @@ -34,20 +34,13 @@ void GELU_fp32_fp32_sigmoid(float32_t *data_in, float32_t *data_out, void GELU_fp32_fp32_sigmoid_grad_chunk(float32_t *grad_in, float32_t *data_in, float32_t *grad_out, int32_t start_idx, int32_t end_idx) { - // d(Gelu)/dx ≈ sigmoid(1.702 * x) + x * sigmoid(1.702 * x) * (1 - - // sigmoid(1.702 * x)) * 1.702 - const float COEFF = 1.702f; + // Exact GELU gradient: gelu'(x) = 0.5*(1+erf(x/sqrt(2))) + x*exp(-0.5*x^2)/sqrt(2*pi) + // 1/sqrt(2) = 0.70710678f + // 1/sqrt(2*pi) = 0.39894228f for (int32_t i = start_idx; i < end_idx; i++) { float x = data_in[i]; - float upstream_grad = grad_in[i]; - float z = COEFF * x; - float sigmoid_z = 1.0f / (1.0f + expf(-z)); - - // d(Gelu)/dx = sigmoid(1.702*x) + x * sigmoid(1.702*x) * - 
// (1-sigmoid(1.702*x)) * 1.702 - float sigmoid_derivative = sigmoid_z * (1.0f - sigmoid_z) * COEFF; - float gelu_derivative = sigmoid_z + x * sigmoid_derivative; - - grad_out[i] = upstream_grad * gelu_derivative; + float gelu_derivative = 0.5f * (1.0f + erff(x * 0.70710678f)) + + x * expf(-0.5f * x * x) * 0.39894228f; + grad_out[i] = grad_in[i] * gelu_derivative; } } diff --git a/TargetLibraries/Generic/src/Layernorm_fp32.c b/TargetLibraries/Generic/src/Layernorm_fp32.c index fb68df8dfe..f8bc77173d 100644 --- a/TargetLibraries/Generic/src/Layernorm_fp32.c +++ b/TargetLibraries/Generic/src/Layernorm_fp32.c @@ -39,14 +39,14 @@ void Layernorm_fp32_fp32(float32_t *data_in, float32_t *data_out, void LayernormGrad_fp32_fp32(float32_t *grad_in, float32_t *data_in, float32_t *grad_out, float32_t *scale, - float32_t *bias, float32_t epsilon, int32_t size, + float32_t epsilon, int32_t size, int32_t lastDimLength) { float32_t mean, variance, std, inv_std; - float32_t sum_dy, sum_dy_scaled, sum_dy_scaled_centered; + float32_t sum_dy, sum_dy_scaled_centered; float32_t centered_input; for (int i = 0; i < (size / lastDimLength); i++) { - // RW: Step 1: Recompute mean and variance from forward pass + // Step 1: Recompute mean and variance from forward pass mean = 0.0f; variance = 0.0f; @@ -64,11 +64,10 @@ void LayernormGrad_fp32_fp32(float32_t *grad_in, float32_t *data_in, std = sqrtf(variance); inv_std = 1.0f / std; - // RW: Step 2: Compute intermediate values needed for gradient calculation + // Step 2: Compute intermediate values needed for gradient calculation sum_dy = 0.0f; sum_dy_scaled_centered = 0.0f; - // RW: Calculate sum(dy) and sum(dy * scale * (x - mean) / std) for (int j = 0; j < lastDimLength; j++) { sum_dy += grad_in[j + i * lastDimLength]; centered_input = data_in[j + i * lastDimLength] - mean; @@ -76,11 +75,10 @@ void LayernormGrad_fp32_fp32(float32_t *grad_in, float32_t *data_in, grad_in[j + i * lastDimLength] * scale[j] * centered_input * inv_std; } - // RW: 
Step 3: Calculate gradients for each element + // Step 3: Calculate dX gradient for each element for (int j = 0; j < lastDimLength; j++) { centered_input = data_in[j + i * lastDimLength] - mean; - // Gradient formula: // dx = (1/std) * scale * (dy - (1/N)*sum(dy) - // (x-mean)/(N*std^2)*sum(dy*scale*(x-mean)/std)) grad_out[j + i * lastDimLength] = @@ -91,3 +89,44 @@ void LayernormGrad_fp32_fp32(float32_t *grad_in, float32_t *data_in, } } } + +void LayernormGradParam_fp32_fp32(float32_t *grad_in, float32_t *data_in, + float32_t *weight_grad, float32_t *bias_grad, + float32_t epsilon, int32_t size, + int32_t lastDimLength) { + float32_t mean, variance, std, inv_std; + float32_t centered_input, hat_x; + int32_t num_sequences = size / lastDimLength; + + // Initialize output gradients to zero + for (int j = 0; j < lastDimLength; j++) { + weight_grad[j] = 0.0f; + bias_grad[j] = 0.0f; + } + + for (int i = 0; i < num_sequences; i++) { + // Recompute mean and variance from forward pass + mean = 0.0f; + for (int j = 0; j < lastDimLength; j++) { + mean += data_in[j + i * lastDimLength]; + } + mean = mean / lastDimLength; + + variance = 0.0f; + for (int j = 0; j < lastDimLength; j++) { + centered_input = data_in[j + i * lastDimLength] - mean; + variance += centered_input * centered_input; + } + variance = variance / lastDimLength; + variance += epsilon; + std = sqrtf(variance); + inv_std = 1.0f / std; + + // Accumulate dscale and dbias over sequences + for (int j = 0; j < lastDimLength; j++) { + hat_x = (data_in[j + i * lastDimLength] - mean) * inv_std; + weight_grad[j] += grad_in[j + i * lastDimLength] * hat_x; + bias_grad[j] += grad_in[j + i * lastDimLength]; + } + } +} diff --git a/TargetLibraries/PULPOpen/CMakeLists.txt b/TargetLibraries/PULPOpen/CMakeLists.txt index 1a510c945b..04c51cafab 100644 --- a/TargetLibraries/PULPOpen/CMakeLists.txt +++ b/TargetLibraries/PULPOpen/CMakeLists.txt @@ -43,6 +43,30 @@ add_subdirectory(../third_party/pulp-nn-mixed 
${CMAKE_CURRENT_BINARY_DIR}/pulp-n target_include_directories(pulp-nn-mixed PUBLIC ${PULP_SDK_INCLUDES}) target_compile_options(pulp-nn-mixed PUBLIC ${PULP_SDK_COMPILE_FLAGS}) +# Add pulp-trainlib integration +target_include_directories(deeploypulp PUBLIC + ${CMAKE_CURRENT_LIST_DIR}/third_party/pulp-trainlib/lib/include +) + +# Add necessary pulp-trainlib sources for ConvGradW +file(GLOB PULP_TRAINLIB_CONV_SOURCES + "${CMAKE_CURRENT_LIST_DIR}/third_party/pulp-trainlib/lib/sources/pulp_conv2d_fp32.c" + "${CMAKE_CURRENT_LIST_DIR}/third_party/pulp-trainlib/lib/sources/pulp_im2col_fp32.c" + "${CMAKE_CURRENT_LIST_DIR}/third_party/pulp-trainlib/lib/sources/pulp_conv_dw_fp32.c" + "${CMAKE_CURRENT_LIST_DIR}/third_party/pulp-trainlib/lib/sources/pulp_conv_pw_fp32.c" + "${CMAKE_CURRENT_LIST_DIR}/third_party/pulp-trainlib/lib/sources/pulp_matmul_fp32.c" + "${CMAKE_CURRENT_LIST_DIR}/third_party/pulp-trainlib/lib/sources/pulp_train_utils_fp32.c" + "${CMAKE_CURRENT_LIST_DIR}/third_party/pulp-trainlib/lib/sources/pulp_conv_naive_fp32.c" +) +target_sources(deeploypulp PRIVATE ${PULP_TRAINLIB_CONV_SOURCES}) + +# Fix compilation issues with pulp-trainlib sources +# Undefine USE_DMA and define float16alt to avoid conflicts +set_source_files_properties(${PULP_TRAINLIB_CONV_SOURCES} + PROPERTIES + COMPILE_FLAGS "-UUSE_DMA -Dfloat16alt=float" +) + target_link_libraries(deeploypulp PUBLIC pulp-nn-mixed) target_link_libraries(deeploypulp INTERFACE pulp-sdk) target_sources(deeploypulp INTERFACE $) diff --git a/TargetLibraries/PULPOpen/inc/DeeployPULPKernels.h b/TargetLibraries/PULPOpen/inc/DeeployPULPKernels.h new file mode 100644 index 0000000000..711d68f378 --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/DeeployPULPKernels.h @@ -0,0 +1,30 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + * + * Aggregated include for all PULPOpen kernel headers. + * This file lives in PULPOpen/inc so that platforms (e.g. 
GAP9) can + * include it unambiguously, avoiding shadowing by Generic kernel headers. + */ + +#ifndef __DEEPLOY_PULP_KERNELS_HEADER_ +#define __DEEPLOY_PULP_KERNELS_HEADER_ + +#include "kernel/AvgPool.h" +#include "kernel/BatchNorm.h" +#include "kernel/Conv.h" +#include "kernel/GELU.h" +#include "kernel/Layernorm.h" +#include "kernel/Matmul.h" +#include "kernel/MaxPool.h" +#include "kernel/Relu.h" +#include "kernel/RQiHardswish.h" +#include "kernel/RequantShift.h" +#include "kernel/Softmax.h" +#include "kernel/UniformRequantShift.h" +#include "kernel/gemm.h" +#include "kernel/gemv.h" +#include "kernel/iRMSnorm.h" + +#endif // __DEEPLOY_PULP_KERNELS_HEADER_ diff --git a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h index f6e8308c97..468f016b8e 100644 --- a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h +++ b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h @@ -23,15 +23,19 @@ #include "pmsis.h" +#include "kernel/AvgPool.h" +#include "kernel/BatchNorm.h" #include "kernel/Conv.h" #include "kernel/GELU.h" #include "kernel/Layernorm.h" #include "kernel/Matmul.h" #include "kernel/MaxPool.h" +#include "kernel/Relu.h" #include "kernel/RQiHardswish.h" #include "kernel/RequantShift.h" #include "kernel/Softmax.h" #include "kernel/UniformRequantShift.h" +#include "kernel/gemm.h" #include "kernel/gemv.h" #include "kernel/iRMSnorm.h" diff --git a/TargetLibraries/PULPOpen/inc/kernel/AvgPool.h b/TargetLibraries/PULPOpen/inc/kernel/AvgPool.h new file mode 100644 index 0000000000..ec1c399602 --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/kernel/AvgPool.h @@ -0,0 +1,63 @@ +/* + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_AVGPOOL_KERNEL_HEADER_ +#define __DEEPLOY_MATH_AVGPOOL_KERNEL_HEADER_ + +#include "DeeployPULPMath.h" + +void PULP_AvgPool2d_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, + uint32_t W, uint32_t H, uint32_t C, + 
uint32_t Q, uint32_t P, uint32_t SQ, + uint32_t SP, float32_t *__restrict__ pDstC, + uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right); + + +void PULP_AvgPool2d_fp32_fp32_CHW(const float32_t *__restrict__ pSrcA, + uint32_t C, uint32_t H, uint32_t W, + uint32_t P, uint32_t Q, uint32_t SP, + uint32_t SQ, float32_t *__restrict__ pDstC, + uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right); + +/** + * @brief Global Average Pooling forward pass (NCHW layout). + * + * For each (n, c), computes the mean over all (h, w) spatial positions: + * output[n*C + c] = sum_{h,w}(input[(n*C+c)*H*W + h*W + w]) / (H*W) + * + * Parallelized over channels: each core handles a contiguous chunk of channels. + * + * @param input Input tensor [N, C, H, W] NCHW float32 + * @param output Output tensor [N, C, 1, 1] stored as [N*C] float32 + * @param N Batch size + * @param C Number of channels + * @param H Spatial height + * @param W Spatial width + */ +void PULP_GlobalAveragePool_fp32(const float32_t *input, float32_t *output, + uint32_t N, uint32_t C, uint32_t H, uint32_t W); + +/** + * @brief Global Average Pooling backward pass (NCHW layout). + * + * Distributes the upstream gradient evenly across all spatial positions: + * dX[n,c,h,w] = dY[n*C + c] / (H*W) + * + * Parallelized over channels: each core handles a contiguous chunk of channels. + * + * @param dY Upstream gradient [N, C, 1, 1] stored as [N*C] float32 + * @param dX Gradient w.r.t. 
input [N, C, H, W] NCHW float32 + * @param N Batch size + * @param C Number of channels + * @param H Spatial height + * @param W Spatial width + */ +void PULP_GlobalAveragePoolGrad_fp32(const float32_t *dY, float32_t *dX, + uint32_t N, uint32_t C, uint32_t H, uint32_t W); + +#endif // __DEEPLOY_MATH_AVGPOOL_KERNEL_HEADER_ diff --git a/TargetLibraries/PULPOpen/inc/kernel/BatchNorm.h b/TargetLibraries/PULPOpen/inc/kernel/BatchNorm.h new file mode 100644 index 0000000000..de4c1a60f7 --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/kernel/BatchNorm.h @@ -0,0 +1,172 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_BATCHNORM_KERNEL_HEADER_ +#define __DEEPLOY_MATH_BATCHNORM_KERNEL_HEADER_ + +#include "DeeployPULPMath.h" + +/** + * @brief Training-mode Batch Normalization forward pass (BatchNormInternal). + * + * Normalizes each channel over (N, H, W) using batch statistics. + * Saves batch mean and 1/sqrt(var+eps) for use by the backward pass. + * Running statistics are read but NOT updated (outputs 1,2 have no consumers). + * + * Parallelized over channels: each core handles a contiguous chunk of channels. 
+ * + * @param X Input [N, C, H_in, W_in] NCHW float32 + * @param gamma Affine scale [C] + * @param beta Affine bias [C] + * @param running_mean EMA running mean [C] (read only) + * @param running_var EMA running variance [C] (read only) + * @param Y Output [N, C, H_in, W_in] + * @param saved_mean Stash: per-channel batch mean [C] + * @param saved_inv_std Stash: per-channel 1/sqrt(var+eps) [C] + * @param N Batch size + * @param C Number of channels + * @param H_in Spatial height + * @param W_in Spatial width + * @param epsilon Numerical stability term + * @param momentum EMA coefficient (currently unused — running stats not updated) + */ +void PULP_BatchNormInternal_fp32(const float32_t *X, const float32_t *gamma, + const float32_t *beta, const float32_t *running_mean, + const float32_t *running_var, float32_t *Y, + float32_t *saved_mean, float32_t *saved_inv_std, + uint32_t N, uint32_t C, uint32_t H_in, uint32_t W_in, + float32_t epsilon, float32_t momentum); + +/** + * @brief Batch Normalization backward pass. + * + * Computes gradients w.r.t. input (dX), scale (dgamma), and bias (dbeta). + * Uses the saved_mean / saved_inv_std stash from the forward pass. + * + * Standard BN backward formula (Ioffe & Szegedy, 2015): + * x_hat[n,c,h,w] = (X[n,c,h,w] - saved_mean[c]) * saved_inv_std[c] + * dbeta[c] = sum_{n,h,w} dY[n,c,h,w] + * dgamma[c] = sum_{n,h,w} dY[n,c,h,w] * x_hat[n,c,h,w] + * dx_hat = dY * gamma[c] + * dX[n,c,h,w] = saved_inv_std[c] / N_total * + * (N_total * dx_hat - dbeta[c] - x_hat * dgamma[c]) + * + * Parallelized over channels: each core handles a contiguous chunk of channels. + * + * @param dY Upstream gradient [N, C, H_in, W_in] + * @param X Original input from forward [N, C, H_in, W_in] + * @param gamma Affine scale [C] + * @param saved_mean Stash batch mean from forward [C] + * @param saved_inv_std Stash 1/sqrt(var+eps) from forward [C] + * @param dX Output: gradient w.r.t. input [N, C, H_in, W_in] + * @param dgamma Output: gradient w.r.t. 
scale [C] + * @param dbeta Output: gradient w.r.t. bias [C] + * @param N Batch size + * @param C Number of channels + * @param H_in Spatial height + * @param W_in Spatial width + * @param epsilon Numerical stability term (unused — inv_std already computed) + */ +void PULP_BatchNormGrad_fp32(const float32_t *dY, const float32_t *X, + const float32_t *gamma, const float32_t *saved_mean, + const float32_t *saved_inv_std, float32_t *dX, + float32_t *dgamma, float32_t *dbeta, uint32_t N, uint32_t C, + uint32_t H_in, uint32_t W_in, float32_t epsilon); + +/** + * @brief Welford reduction: compute per-channel mean and 1/sqrt(var+eps) over (N,H,W). + * + * This is the reduction half of a split BN forward pass. + * Parallelized over channels. + * + * @param X Input [N, C, H_in, W_in] NCHW float32 + * @param saved_mean Output: per-channel batch mean [C] + * @param saved_inv_std Output: per-channel 1/sqrt(var+eps) [C] + * @param N Batch size + * @param C Number of channels + * @param H_in Spatial height + * @param W_in Spatial width + * @param epsilon Numerical stability term + */ +void PULP_WelfordReduce_fp32(const float32_t *X, float32_t *saved_mean, + float32_t *saved_inv_std, uint32_t N, uint32_t C, + uint32_t H_in, uint32_t W_in, float32_t epsilon); + +/** + * @brief Channel normalize: Y = (X - mean) * inv_std * gamma + beta. + * + * This is the elementwise half of a split BN forward pass. + * Freely spatially tileable (no reduction). + * Parallelized over channels. 
+ * + * @param X Input [N, C, H_in, W_in] + * @param saved_mean Per-channel batch mean [C] + * @param saved_inv_std Per-channel 1/sqrt(var+eps) [C] + * @param gamma Affine scale [C] + * @param beta Affine bias [C] + * @param Y Output [N, C, H_in, W_in] + * @param N Batch size + * @param C Number of channels + * @param H_in Spatial height + * @param W_in Spatial width + */ +void PULP_ChannelNormalize_fp32(const float32_t *X, const float32_t *saved_mean, + const float32_t *saved_inv_std, const float32_t *gamma, + const float32_t *beta, float32_t *Y, uint32_t N, + uint32_t C, uint32_t H_in, uint32_t W_in); + +/** + * @brief BN gradient reduction: compute dgamma and dbeta over (N,H,W). + * + * This is the reduction half of a split BN backward pass. + * Parallelized over channels. + * + * @param dY Upstream gradient [N, C, H_in, W_in] + * @param X Original input from forward [N, C, H_in, W_in] + * @param saved_mean Stash batch mean from forward [C] + * @param saved_inv_std Stash 1/sqrt(var+eps) from forward [C] + * @param dgamma Output: gradient w.r.t. scale [C] + * @param dbeta Output: gradient w.r.t. bias [C] + * @param N Batch size + * @param C Number of channels + * @param H_in Spatial height + * @param W_in Spatial width + */ +void PULP_BNGradReduce_fp32(const float32_t *dY, const float32_t *X, + const float32_t *saved_mean, const float32_t *saved_inv_std, + float32_t *dgamma, float32_t *dbeta, uint32_t N, + uint32_t C, uint32_t H_in, uint32_t W_in); + +/** + * @brief BN gradient normalize: compute dX using pre-computed dgamma, dbeta. + * + * This is the elementwise half of a split BN backward pass. + * Freely spatially tileable (no reduction). + * Parallelized over channels. 
+ * + * @param dY Upstream gradient [N, C, H_in, W_in] + * @param X Original input from forward [N, C, H_in, W_in] + * @param saved_mean Stash batch mean from forward [C] + * @param saved_inv_std Stash 1/sqrt(var+eps) from forward [C] + * @param gamma Affine scale [C] + * @param dgamma Pre-computed gradient w.r.t. scale [C] + * @param dbeta Pre-computed gradient w.r.t. bias [C] + * @param dX Output: gradient w.r.t. input [N, C, H_in, W_in] + * @param N Batch size + * @param C Number of channels + * @param H_in Spatial height + * @param W_in Spatial width + * @param N_total_inv 1.0f / (N * H_full * W_full) — must be pre-computed with + * FULL spatial dims (not tile dims) for correct BN gradient. + */ +void PULP_BNGradNormalize_fp32(const float32_t *dY, const float32_t *X, + const float32_t *saved_mean, const float32_t *saved_inv_std, + const float32_t *gamma, const float32_t *dgamma, + const float32_t *dbeta, float32_t *dX, uint32_t N, + uint32_t C, uint32_t H_in, uint32_t W_in, + float32_t N_total_inv); + +#endif /* __DEEPLOY_MATH_BATCHNORM_KERNEL_HEADER_ */ diff --git a/TargetLibraries/PULPOpen/inc/kernel/Conv.h b/TargetLibraries/PULPOpen/inc/kernel/Conv.h index 3ebab54a0b..9a166e2a94 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/Conv.h +++ b/TargetLibraries/PULPOpen/inc/kernel/Conv.h @@ -35,4 +35,95 @@ void PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC( uint32_t pad_left, uint32_t pad_right, float32_t *__restrict__ pContextBuffer); +// ============================================================================ +// Minimal pulp-trainlib interface +// ============================================================================ + + +void PULP_ConvGradW2d_fp32_fp32_fp32_CHW( + const float *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out, + uint32_t C_out, const float *__restrict__ pInput, uint32_t H_in, + uint32_t W_in, uint32_t C_in, uint32_t P, uint32_t Q, uint32_t SP, + uint32_t SQ, float *__restrict__ pGradWeight, uint32_t pad_top, + uint32_t pad_bottom, 
uint32_t pad_left, uint32_t pad_right); + +void PULP_ConvGradX2d_fp32_fp32_fp32_CHW( + const float *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out, + uint32_t C_out, const float *__restrict__ pWeight, uint32_t C_in, + uint32_t P, uint32_t Q, uint32_t SP, uint32_t SQ, + float *__restrict__ pGradIn, uint32_t H_in, uint32_t W_in, uint32_t pad_top, + uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right); + +void PULP_ConvGradX2d_fp32_fp32_fp32_CHW_Im2Col_tiled( + const float *__restrict__ pGradOut, + uint32_t dim_im_out_x, uint32_t dim_im_out_y, uint32_t ch_im_out, + const float *__restrict__ pWeight, + uint32_t ch_im_in, + uint32_t dim_kernel_x, uint32_t dim_kernel_y, + uint32_t stride_h, uint32_t stride_w, + float *__restrict__ pGradIn, + uint32_t dim_im_in_x, uint32_t dim_im_in_y, + uint32_t padding_y_top, uint32_t padding_y_bottom, + uint32_t padding_x_left, uint32_t padding_x_right, + uint16_t offset_grad_in_h, uint16_t offset_grad_in_w, + uint16_t offset_grad_out_h, uint16_t offset_grad_out_w, + float *__restrict__ ctxtBuffer, uint32_t ctxtBufferSize, + float *__restrict__ btBuffer, uint32_t btBufferSize); + +void PULP_ConvGradW2d_fp32_fp32_fp32_CHW_Im2Col( + const float *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out, + uint32_t C_out, const float *__restrict__ pInput, uint32_t H_in, + uint32_t W_in, uint32_t C_in, uint32_t P, uint32_t Q, uint32_t SP, + uint32_t SQ, float *__restrict__ pGradWeight, uint32_t pad_top, + uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right, + float *__restrict__ ctxtBuffer, uint32_t ctxtBufferSize); + +void PULP_DWConvTrans2d_fp32_fp32_fp32_HWC( + const float *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out, + uint32_t C_total, const float *__restrict__ pWeight, uint32_t P, uint32_t Q, + uint32_t SP, uint32_t SQ, float *__restrict__ pGradIn, uint32_t pad_top, + uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right); + +void PULP_DWConvGradW2d_fp32_fp32_fp32_CHW( + const float *__restrict__ 
pGradOut, uint32_t H_out, uint32_t W_out, + uint32_t C_out, const float *__restrict__ pInput, uint32_t H_in, + uint32_t W_in, uint32_t C_in, uint32_t P, uint32_t Q, uint32_t SP, + uint32_t SQ, float *__restrict__ pGradWeight, uint32_t pad_top, + uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right); + +// Pointwise (1x1) Convolution Gradient Functions +void PULP_PWConvGradW2d_fp32_fp32_fp32_CHW( + const float *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out, + uint32_t C_out, const float *__restrict__ pInput, uint32_t H_in, + uint32_t W_in, uint32_t C_in, float *__restrict__ pGradWeight); + +void PULP_PWConvGradX2d_fp32_fp32_fp32_CHW( + const float *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out, + uint32_t C_out, const float *__restrict__ pWeight, uint32_t C_in, + float *__restrict__ pGradIn, uint32_t H_in, uint32_t W_in, + float *__restrict__ pTransposeBuffer, uint32_t transposeBufferSize); + +void PULP_DWConvGradX2d_fp32_fp32_fp32_CHW_tiled( + const float *__restrict__ pGradOut, + uint32_t dim_im_out_x, // H_out (tile) + uint32_t dim_im_out_y, // W_out (tile) + uint32_t ch_im_out, // C_out (full) + const float *__restrict__ pWeight, + uint32_t ch_im_in, // C_in (full) + uint32_t dim_kernel_x, // P (kernel H) + uint32_t dim_kernel_y, // Q (kernel W) + uint32_t stride_h, // SH + uint32_t stride_w, // SW + float *__restrict__ pGradIn, + uint32_t dim_im_in_x, // H_in (tile) + uint32_t dim_im_in_y, // W_in (tile) + uint32_t padding_x_left, // pad_top + uint32_t padding_x_right, // pad_bottom (unused here) + uint32_t padding_y_top, // pad_left + uint32_t padding_y_bottom,// pad_right (unused here) + uint16_t offset_grad_in_h, + uint16_t offset_grad_in_w, + uint16_t offset_grad_out_h, + uint16_t offset_grad_out_w +); #endif // __DEEPLOY_MATH_CONV_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/inc/kernel/Layernorm.h b/TargetLibraries/PULPOpen/inc/kernel/Layernorm.h index cb56152bd6..ea638bb48e 100644 --- 
a/TargetLibraries/PULPOpen/inc/kernel/Layernorm.h +++ b/TargetLibraries/PULPOpen/inc/kernel/Layernorm.h @@ -9,8 +9,66 @@ #include "DeeployPULPMath.h" +/** + * @brief Forward LayerNorm: y = (x - mean) / inv_std_dev * scale + bias + * + * Parallelized across sequence positions (axis=-1 normalization). + * Writes mean and inv_std_dev stash tensors for use by the backward pass. + * + * @param data_in Input tensor [seq_length, lastDimLength] + * @param data_out Output tensor [seq_length, lastDimLength] + * @param scale Gain (gamma) [lastDimLength] + * @param bias Bias (beta) [lastDimLength] + * @param mean_out Stash: per-sequence mean [seq_length] + * @param inv_std_dev_out Stash: per-sequence 1/sqrt(var+eps) [seq_length] + * @param size Total number of elements (seq_length * lastDimLength) + * @param lastDimLength Normalization dimension size + * @param epsilon Numerical stability constant + */ void PULP_Layernorm_fp32_fp32(float32_t *data_in, float32_t *data_out, - float32_t *scale, float32_t *bias, uint32_t size, - uint32_t lastDimLength, float32_t epsilon); + float32_t *scale, float32_t *bias, + float32_t *mean_out, float32_t *inv_std_dev_out, + uint32_t size, uint32_t lastDimLength, + float32_t epsilon); + +/** + * @brief Backward LayerNorm: compute dX for a chunk of sequences. + * + * Uses pre-computed mean and inv_std_dev stash from the forward pass. + * Parallelized: each core calls this for its own chunk of sequences. 
+ * + * @param dy Upstream gradient chunk [elem_count] + * @param x Forward input chunk [elem_count] + * @param mean Stash mean for this chunk [chunk_seq_count] + * @param inv_std_dev Stash inv_std_dev for this chunk [chunk_seq_count] + * @param dx Output: input gradient chunk [elem_count] + * @param gamma Scale parameter [lastDimLength] + * @param elem_count Number of elements in chunk + * @param lastDimLength Feature dimension size + */ +void PULP_LayernormGrad_fp32_fp32(const float32_t *dy, const float32_t *x, + const float32_t *mean, const float32_t *inv_std_dev, + float32_t *dx, const float32_t *gamma, + uint32_t elem_count, uint32_t lastDimLength); + +/** + * @brief Backward LayerNorm: compute dscale and dbias over all sequences. + * + * Uses pre-computed mean and inv_std_dev stash from the forward pass. + * Single-core (core 0) operation. + * + * @param dy Full upstream gradient [size] + * @param x Full forward input [size] + * @param mean Full stash mean [seq_length] + * @param inv_std_dev Full stash inv_std_dev [seq_length] + * @param dgamma Output: scale gradient [lastDimLength] + * @param dbeta Output: bias gradient [lastDimLength] + * @param size Total number of elements (seq_length * lastDimLength) + * @param lastDimLength Feature dimension size + */ +void PULP_LayernormGradParam_fp32_fp32(const float32_t *dy, const float32_t *x, + const float32_t *mean, const float32_t *inv_std_dev, + float32_t *dgamma, float32_t *dbeta, + uint32_t size, uint32_t lastDimLength); -#endif // __DEEPLOY_MATH_LAYERNORM_KERNEL_HEADER__ \ No newline at end of file +#endif // __DEEPLOY_MATH_LAYERNORM_KERNEL_HEADER__ diff --git a/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h b/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h index b37487439f..1c3b131127 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h +++ b/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h @@ -16,4 +16,13 @@ void PULP_MaxPool2d_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, uint32_t pad_top, uint32_t 
pad_bottom, uint32_t pad_left, uint32_t pad_right); +void PULP_MaxPoolGrad2d_fp32_fp32_HWC(const float32_t *__restrict__ pGradOut, + const float32_t *__restrict__ pInput, + uint32_t H_out, uint32_t W_out, uint32_t C, + uint32_t H_in, uint32_t W_in, + uint32_t P, uint32_t Q, uint32_t SP, + uint32_t SQ, float32_t *__restrict__ pGradIn, + uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right); + #endif // __DEEPLOY_MATH_MAXPOOL_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/inc/kernel/Relu.h b/TargetLibraries/PULPOpen/inc/kernel/Relu.h index 1c49bd1cd8..51d3e8d6e1 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/Relu.h +++ b/TargetLibraries/PULPOpen/inc/kernel/Relu.h @@ -11,4 +11,7 @@ void PULP_Relu_fp32_fp32(float32_t *input, float32_t *output, uint32_t size); +void PULP_ReluGrad_fp32_fp32(float32_t *grad_in, float32_t *data_in, + float32_t *grad_out, uint32_t size); + #endif // __DEEPLOY_MATH_RELU_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/src/AvgPool.c b/TargetLibraries/PULPOpen/src/AvgPool.c new file mode 100644 index 0000000000..af0185effd --- /dev/null +++ b/TargetLibraries/PULPOpen/src/AvgPool.c @@ -0,0 +1,192 @@ +/* + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeployPULPMath.h" +#include "pmsis.h" + + +void PULP_AvgPool2d_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, + uint32_t W, uint32_t H, uint32_t C, + uint32_t Q, uint32_t P, uint32_t SQ, + uint32_t SP, float32_t *__restrict__ pDstC, + uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right) { + + int8_t core_id = pi_core_id(); + int8_t log2Core = LOG2(NUM_CORES); + + uint16_t ch_chunk = (C >> log2Core) + ((C & (NUM_CORES - 1)) != 0); + uint16_t ch_start = MIN(ch_chunk * core_id, C); + uint16_t ch_stop = MIN(ch_start + ch_chunk, C); + + uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1; + 
uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1; + + for (uint32_t h_out = 0; h_out < H_out; ++h_out) { + for (uint32_t w_out = 0; w_out < W_out; ++w_out) { + for (uint32_t c = ch_start; c < ch_stop; ++c) { + + float32_t sum = 0.0f; + + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + for (uint32_t p = 0; p < P; ++p) { + int32_t h_in = h_in_start + p; + + //RW: Compiler Bug related to continue + // if (h_in < 0 || h_in >= (int32_t)H) continue; + + for (uint32_t q = 0; q < Q; ++q) { + int32_t w_in = w_in_start + q; + // if (w_in < 0 || w_in >= (int32_t)W) continue; + + uint32_t input_idx = (h_in * W + w_in) * C + c; + sum += pSrcA[input_idx]; + } + } + + uint32_t output_idx = (h_out * W_out + w_out) * C + c; + pDstC[output_idx] = sum / (float32_t)(P * Q); + } + } + } +} + +void PULP_AvgPool2d_fp32_fp32_CHW(const float32_t *__restrict__ pSrcA, + uint32_t C, uint32_t H, uint32_t W, + uint32_t P, uint32_t Q, uint32_t SP, + uint32_t SQ, float32_t *__restrict__ pDstC, + uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right) +{ + int8_t core_id = pi_core_id(); + int8_t log2Core = LOG2(NUM_CORES); + + uint16_t ch_chunk = (C >> log2Core) + ((C & (NUM_CORES - 1)) != 0); + uint16_t ch_start = MIN(ch_chunk * core_id, C); + uint16_t ch_stop = MIN(ch_start + ch_chunk, C); + + uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1; + uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1; + + for (uint32_t c = ch_start; c < ch_stop; ++c) { + for (uint32_t h_out = 0; h_out < H_out; ++h_out) { + for (uint32_t w_out = 0; w_out < W_out; ++w_out) { + + float32_t sum = 0.0f; + + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + for (uint32_t p = 0; p < P; ++p) { + int32_t h_in = h_in_start + (int32_t)p; + + for (uint32_t q = 0; q < Q; ++q) { + int32_t sw_in = w_in_start + (int32_t)q; + + uint32_t input_idx = c * (H * W) + h_in * W + sw_in; + sum += 
pSrcA[input_idx]; + + } + } + uint32_t output_idx = c * (H_out * W_out) + h_out * W_out + w_out; + + pDstC[output_idx] = sum / (P*Q); + printf("output is %f\n", pDstC[output_idx]); + } + } + } +} + + +void PULP_AvgPoolGrad2d_fp32_fp32_HWC(const float32_t *__restrict__ pGradOut, + uint32_t H_out, uint32_t W_out, uint32_t C, + uint32_t H_in, uint32_t W_in, + uint32_t P, uint32_t Q, uint32_t SP, + uint32_t SQ, float32_t *__restrict__ pGradIn, + uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right) +{ + int8_t core_id = pi_core_id(); + int8_t log2Core = LOG2(NUM_CORES); + + uint16_t ch_chunk = (C >> log2Core) + ((C & (NUM_CORES - 1)) != 0); + uint16_t ch_start = MIN(ch_chunk * core_id, C); + uint16_t ch_stop = MIN(ch_start + ch_chunk, C); + + for (uint32_t h = 0; h < H_in; ++h) { + for (uint32_t w = 0; w < W_in; ++w) { + for (uint32_t c = ch_start; c < ch_stop; ++c) { + uint32_t idx = (h * W_in + w) * C + c; + pGradIn[idx] = 0.0f; + } + } + } + + + uint32_t H_pad = H_in + pad_top + pad_bottom; + uint32_t W_pad = W_in + pad_left + pad_right; + + + for (uint32_t h_out = 0; h_out < H_out; ++h_out) { + for (uint32_t w_out = 0; w_out < W_out; ++w_out) { + + + int32_t h_start_pad = (int32_t)h_out * (int32_t)SP; + int32_t w_start_pad = (int32_t)w_out * (int32_t)SQ; + + int32_t h_end_pad = h_start_pad + (int32_t)P; + int32_t w_end_pad = w_start_pad + (int32_t)Q; + + + if (h_end_pad > (int32_t)H_pad) h_end_pad = (int32_t)H_pad; + if (w_end_pad > (int32_t)W_pad) w_end_pad = (int32_t)W_pad; + + + int32_t actual_h_start = h_start_pad - (int32_t)pad_top; + int32_t actual_w_start = w_start_pad - (int32_t)pad_left; + int32_t actual_h_end = h_end_pad - (int32_t)pad_top; + int32_t actual_w_end = w_end_pad - (int32_t)pad_left; + + if (actual_h_start < 0) actual_h_start = 0; + if (actual_w_start < 0) actual_w_start = 0; + if (actual_h_end > (int32_t)H_in) actual_h_end = (int32_t)H_in; + if (actual_w_end > (int32_t)W_in) actual_w_end = (int32_t)W_in; + + int32_t 
win_h = actual_h_end - actual_h_start; + int32_t win_w = actual_w_end - actual_w_start; + + uint32_t pool_size; + if (win_h <= 0 || win_w <= 0) { + + pool_size = 1; + } else { + pool_size = (uint32_t)(win_h * win_w); + } + + for (uint32_t c = ch_start; c < ch_stop; ++c) { + uint32_t out_idx = (h_out * W_out + w_out) * C + c; + float32_t g = pGradOut[out_idx]; + + float32_t grad_per_pos = g / (float32_t)pool_size; + + + for (uint32_t p = 0; p < P; ++p) { + for (uint32_t q = 0; q < Q; ++q) { + int32_t in_h = h_start_pad + (int32_t)p - (int32_t)pad_top; + int32_t in_w = w_start_pad + (int32_t)q - (int32_t)pad_left; + + if ((uint32_t)in_h < H_in && (uint32_t)in_w < W_in) { + uint32_t in_idx = ((uint32_t)in_h * W_in + (uint32_t)in_w) * C + c; + pGradIn[in_idx] += grad_per_pos; + } + } + } + } + } + } +} \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/src/BatchNorm.c b/TargetLibraries/PULPOpen/src/BatchNorm.c new file mode 100644 index 0000000000..06f8756e7f --- /dev/null +++ b/TargetLibraries/PULPOpen/src/BatchNorm.c @@ -0,0 +1,304 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "pmsis.h" + +#include "DeeployPULPMath.h" + +#include + +/* + * Training-mode Batch Normalization forward pass (BatchNormInternal). + * + * Computes per-channel batch statistics over (N, H, W) and normalizes. + * Saves batch mean and 1/sqrt(var+eps) into stash buffers for the backward pass. + * Running statistics are NOT updated (their outputs have no consumers in the graph). + * + * Layout: NCHW — element [n, c, h, w] lives at offset (n*C + c)*N_hw + h*W_in + w. + * + * Parallelism: channels are split evenly across cores. 
+ */ +void PULP_BatchNormInternal_fp32(const float32_t *X, const float32_t *gamma, + const float32_t *beta, const float32_t *running_mean, + const float32_t *running_var, float32_t *Y, + float32_t *saved_mean, float32_t *saved_inv_std, + uint32_t N, uint32_t C, uint32_t H_in, uint32_t W_in, + float32_t epsilon, float32_t momentum) { + int8_t core_id = pi_core_id(); + int8_t log2Core = LOG2(NUM_CORES); + + uint32_t N_hw = H_in * W_in; + uint32_t N_total = N * N_hw; + float32_t inv_N = 1.0f / (float32_t)N_total; + + /* Split channels across cores */ + int32_t chunk = ((int32_t)C >> log2Core) + (((int32_t)C & (NUM_CORES - 1)) != 0); + int32_t c_start = MIN(chunk * core_id, (int32_t)C); + int32_t c_end = MIN(c_start + chunk, (int32_t)C); + + for (int32_t c = c_start; c < c_end; c++) { + /* ── Compute batch mean ─────────────────────────────────────────────── */ + float32_t mean = 0.0f; + for (uint32_t n = 0; n < N; n++) { + const float32_t *x_nc = X + (n * C + c) * N_hw; + for (uint32_t hw = 0; hw < N_hw; hw++) { + mean += x_nc[hw]; + } + } + mean *= inv_N; + saved_mean[c] = mean; + + /* ── Compute batch variance (unbiased=False) ────────────────────────── */ + float32_t var = 0.0f; + for (uint32_t n = 0; n < N; n++) { + const float32_t *x_nc = X + (n * C + c) * N_hw; + for (uint32_t hw = 0; hw < N_hw; hw++) { + float32_t diff = x_nc[hw] - mean; + var += diff * diff; + } + } + var *= inv_N; + + float32_t inv_std = 1.0f / sqrtf(var + epsilon); + saved_inv_std[c] = inv_std; + + float32_t g = gamma[c]; + float32_t b = beta[c]; + + /* ── Normalize and apply affine transform ───────────────────────────── */ + for (uint32_t n = 0; n < N; n++) { + const float32_t *x_nc = X + (n * C + c) * N_hw; + float32_t *y_nc = Y + (n * C + c) * N_hw; + for (uint32_t hw = 0; hw < N_hw; hw++) { + y_nc[hw] = (x_nc[hw] - mean) * inv_std * g + b; + } + } + } +} + +/* + * Batch Normalization backward pass. 
+ * + * Uses saved_mean / saved_inv_std stash from the forward pass to compute: + * dbeta[c] = sum_{n,h,w} dY[n,c,h,w] + * dgamma[c] = sum_{n,h,w} dY[n,c,h,w] * x_hat[n,c,h,w] + * dX[n,c,h,w] = inv_std / N_total * (N_total * dY[n,c,h,w] * gamma[c] + * - dbeta[c] + * - x_hat[n,c,h,w] * dgamma[c]) + * + * Parallelism: channels are split evenly across cores. + * Each core independently computes its channel slice of dgamma, dbeta, dX. + */ +/* + * Welford reduction: compute per-channel mean and 1/sqrt(var+eps). + * Parallelism: channels split across cores. + */ +void PULP_WelfordReduce_fp32(const float32_t *X, float32_t *saved_mean, + float32_t *saved_inv_std, uint32_t N, uint32_t C, + uint32_t H_in, uint32_t W_in, float32_t epsilon) { + int8_t core_id = pi_core_id(); + int8_t log2Core = LOG2(NUM_CORES); + + uint32_t N_hw = H_in * W_in; + uint32_t N_total = N * N_hw; + float32_t inv_N = 1.0f / (float32_t)N_total; + + int32_t chunk = ((int32_t)C >> log2Core) + (((int32_t)C & (NUM_CORES - 1)) != 0); + int32_t c_start = MIN(chunk * core_id, (int32_t)C); + int32_t c_end = MIN(c_start + chunk, (int32_t)C); + + for (int32_t c = c_start; c < c_end; c++) { + float32_t mean = 0.0f; + for (uint32_t n = 0; n < N; n++) { + const float32_t *x_nc = X + (n * C + c) * N_hw; + for (uint32_t hw = 0; hw < N_hw; hw++) { + mean += x_nc[hw]; + } + } + mean *= inv_N; + saved_mean[c] = mean; + + float32_t var = 0.0f; + for (uint32_t n = 0; n < N; n++) { + const float32_t *x_nc = X + (n * C + c) * N_hw; + for (uint32_t hw = 0; hw < N_hw; hw++) { + float32_t diff = x_nc[hw] - mean; + var += diff * diff; + } + } + var *= inv_N; + saved_inv_std[c] = 1.0f / sqrtf(var + epsilon); + } +} + +/* + * Channel normalize: Y = (X - mean) * inv_std * gamma + beta. + * Freely spatially tileable. Parallelism: channels split across cores. 
+ */ +void PULP_ChannelNormalize_fp32(const float32_t *X, const float32_t *saved_mean, + const float32_t *saved_inv_std, const float32_t *gamma, + const float32_t *beta, float32_t *Y, uint32_t N, + uint32_t C, uint32_t H_in, uint32_t W_in) { + int8_t core_id = pi_core_id(); + int8_t log2Core = LOG2(NUM_CORES); + + uint32_t N_hw = H_in * W_in; + + int32_t chunk = ((int32_t)C >> log2Core) + (((int32_t)C & (NUM_CORES - 1)) != 0); + int32_t c_start = MIN(chunk * core_id, (int32_t)C); + int32_t c_end = MIN(c_start + chunk, (int32_t)C); + + for (int32_t c = c_start; c < c_end; c++) { + float32_t mean = saved_mean[c]; + float32_t inv_std = saved_inv_std[c]; + float32_t g = gamma[c]; + float32_t b = beta[c]; + + for (uint32_t n = 0; n < N; n++) { + const float32_t *x_nc = X + (n * C + c) * N_hw; + float32_t *y_nc = Y + (n * C + c) * N_hw; + for (uint32_t hw = 0; hw < N_hw; hw++) { + y_nc[hw] = (x_nc[hw] - mean) * inv_std * g + b; + } + } + } +} + +/* + * BN gradient reduction: compute dgamma and dbeta. + * Parallelism: channels split across cores. 
+ */ +void PULP_BNGradReduce_fp32(const float32_t *dY, const float32_t *X, + const float32_t *saved_mean, const float32_t *saved_inv_std, + float32_t *dgamma, float32_t *dbeta, uint32_t N, + uint32_t C, uint32_t H_in, uint32_t W_in) { + int8_t core_id = pi_core_id(); + int8_t log2Core = LOG2(NUM_CORES); + + uint32_t N_hw = H_in * W_in; + + int32_t chunk = ((int32_t)C >> log2Core) + (((int32_t)C & (NUM_CORES - 1)) != 0); + int32_t c_start = MIN(chunk * core_id, (int32_t)C); + int32_t c_end = MIN(c_start + chunk, (int32_t)C); + + for (int32_t c = c_start; c < c_end; c++) { + float32_t mean = saved_mean[c]; + float32_t inv_std = saved_inv_std[c]; + + float32_t sum_dbeta = 0.0f; + float32_t sum_dgamma = 0.0f; + + for (uint32_t n = 0; n < N; n++) { + const float32_t *x_nc = X + (n * C + c) * N_hw; + const float32_t *dy_nc = dY + (n * C + c) * N_hw; + for (uint32_t hw = 0; hw < N_hw; hw++) { + float32_t x_hat = (x_nc[hw] - mean) * inv_std; + sum_dbeta += dy_nc[hw]; + sum_dgamma += dy_nc[hw] * x_hat; + } + } + dgamma[c] = sum_dgamma; + dbeta[c] = sum_dbeta; + } +} + +/* + * BN gradient normalize: compute dX using pre-computed dgamma, dbeta. + * Freely spatially tileable. Parallelism: channels split across cores. + * + * N_total_inv = 1.0f / (N * H_full * W_full), pre-computed with FULL spatial dims. + */ +void PULP_BNGradNormalize_fp32(const float32_t *dY, const float32_t *X, + const float32_t *saved_mean, const float32_t *saved_inv_std, + const float32_t *gamma, const float32_t *dgamma, + const float32_t *dbeta, float32_t *dX, uint32_t N, + uint32_t C, uint32_t H_in, uint32_t W_in, + float32_t N_total_inv) { + int8_t core_id = pi_core_id(); + int8_t log2Core = LOG2(NUM_CORES); + + uint32_t N_hw = H_in * W_in; + /* N_total for the BN formula uses FULL spatial extents (passed as N_total_inv). 
*/ + float32_t N_total_f = 1.0f / N_total_inv; + + int32_t chunk = ((int32_t)C >> log2Core) + (((int32_t)C & (NUM_CORES - 1)) != 0); + int32_t c_start = MIN(chunk * core_id, (int32_t)C); + int32_t c_end = MIN(c_start + chunk, (int32_t)C); + + for (int32_t c = c_start; c < c_end; c++) { + float32_t mean = saved_mean[c]; + float32_t inv_std = saved_inv_std[c]; + float32_t g = gamma[c]; + float32_t dg = dgamma[c]; + float32_t db = dbeta[c]; + float32_t scale = inv_std * N_total_inv; + + for (uint32_t n = 0; n < N; n++) { + const float32_t *x_nc = X + (n * C + c) * N_hw; + const float32_t *dy_nc = dY + (n * C + c) * N_hw; + float32_t *dx_nc = dX + (n * C + c) * N_hw; + for (uint32_t hw = 0; hw < N_hw; hw++) { + float32_t x_hat = (x_nc[hw] - mean) * inv_std; + float32_t dx_hat = dy_nc[hw] * g; + dx_nc[hw] = scale * (N_total_f * dx_hat - db - x_hat * dg); + } + } + } +} + +void PULP_BatchNormGrad_fp32(const float32_t *dY, const float32_t *X, + const float32_t *gamma, const float32_t *saved_mean, + const float32_t *saved_inv_std, float32_t *dX, + float32_t *dgamma, float32_t *dbeta, uint32_t N, uint32_t C, + uint32_t H_in, uint32_t W_in, float32_t epsilon) { + int8_t core_id = pi_core_id(); + int8_t log2Core = LOG2(NUM_CORES); + + uint32_t N_hw = H_in * W_in; + uint32_t N_total = N * N_hw; + float32_t inv_N = 1.0f / (float32_t)N_total; + + /* Split channels across cores */ + int32_t chunk = ((int32_t)C >> log2Core) + (((int32_t)C & (NUM_CORES - 1)) != 0); + int32_t c_start = MIN(chunk * core_id, (int32_t)C); + int32_t c_end = MIN(c_start + chunk, (int32_t)C); + + for (int32_t c = c_start; c < c_end; c++) { + float32_t mean = saved_mean[c]; + float32_t inv_std = saved_inv_std[c]; + float32_t g = gamma[c]; + + /* ── First pass: accumulate dbeta and dgamma ─────────────────────────── */ + float32_t sum_dbeta = 0.0f; + float32_t sum_dgamma = 0.0f; + + for (uint32_t n = 0; n < N; n++) { + const float32_t *x_nc = X + (n * C + c) * N_hw; + const float32_t *dy_nc = dY + (n * C + 
c) * N_hw; + for (uint32_t hw = 0; hw < N_hw; hw++) { + float32_t x_hat = (x_nc[hw] - mean) * inv_std; + sum_dbeta += dy_nc[hw]; + sum_dgamma += dy_nc[hw] * x_hat; + } + } + dgamma[c] = sum_dgamma; + dbeta[c] = sum_dbeta; + + /* ── Second pass: compute dX ─────────────────────────────────────────── */ + float32_t scale = inv_std * inv_N; + + for (uint32_t n = 0; n < N; n++) { + const float32_t *x_nc = X + (n * C + c) * N_hw; + const float32_t *dy_nc = dY + (n * C + c) * N_hw; + float32_t *dx_nc = dX + (n * C + c) * N_hw; + for (uint32_t hw = 0; hw < N_hw; hw++) { + float32_t x_hat = (x_nc[hw] - mean) * inv_std; + float32_t dx_hat = dy_nc[hw] * g; + dx_nc[hw] = scale * ((float32_t)N_total * dx_hat - sum_dbeta - x_hat * sum_dgamma); + } + } + } +} diff --git a/TargetLibraries/PULPOpen/src/ConvGrad.c b/TargetLibraries/PULPOpen/src/ConvGrad.c new file mode 100644 index 0000000000..3e9c3fd502 --- /dev/null +++ b/TargetLibraries/PULPOpen/src/ConvGrad.c @@ -0,0 +1,1088 @@ + +/* + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeployPULPMath.h" +#include "pmsis.h" + +// ============================================================================ +// Minimal pulp-trainlib interface - avoiding pulp_train_defines.h conflicts +// ============================================================================ + +struct blob { + float *data; + float *diff; + int dim; + int W; + int H; + int C; +}; + +void pulp_conv2d_fp32_bw_param_grads_cl(void *Conv2D_args); +void pulp_conv2d_fp32_bw_input_grads_cl(void *Conv2D_args); + +struct Conv2D_args { + struct blob *input; + struct blob *coeff; + struct blob *bias; + struct blob *output; + int Lpad; + int Rpad; + int Upad; + int Dpad; + int stride_h; + int stride_w; + float *i2c_buffer; + float *bt_buffer; + int skip_wg_grad; + int skip_in_grad; + int HWC; + int opt_matmul_type_fw; + int opt_matmul_type_wg; + int opt_matmul_type_ig; + int USE_BIASES; + int 
/* Backward entry points of pulp-trainlib's fp32 depthwise convolution.
   The argument is a struct DepthWise_Conv_args *, passed as void *. */
void pulp_conv_dw_fp32_bw_input_grads_cl(void *DepthWise_Conv_args);
void pulp_conv_dw_fp32_bw_param_grads_cl(void *DepthWise_Conv_args);

/* Mirror of pulp-trainlib's DepthWise_Conv_args. Field order and types must
   match the library's own definition exactly — it is re-declared here only
   to avoid including pulp_train_defines.h (see file header). */
struct DepthWise_Conv_args {
  struct blob *input;  /* activations in .data, input gradient in .diff */
  struct blob *coeff;  /* weights in .data, weight gradient in .diff */
  struct blob *output; /* output gradient in .diff */

  int stride_h;
  int stride_w;

  int Lpad; /* left pad */
  int Rpad; /* right pad */
  int Upad; /* top pad */
  int Dpad; /* bottom pad */

  int skip_wg_grad; /* 1: do not compute the weight gradient */
  int skip_in_grad; /* 1: do not compute the input gradient */

  int HWC; /* wrappers below pass 0, i.e. CHW layout */
};

/* Backward entry points of pulp-trainlib's fp32 pointwise (1x1) convolution. */
void pulp_conv_pw_fp32_bw_param_grads_cl(void *PointWise_Conv_args);
void pulp_conv_pw_fp32_bw_input_grads_cl(void *PointWise_Conv_args);

/* Mirror of pulp-trainlib's PointWise_Conv_args (same ABI caveat as above). */
struct PointWise_Conv_args {
  struct blob *input;
  struct blob *coeff;
  struct blob *output;
  float *transpose_buffer; /* scratch used to transpose W in the input-grad path */
  int skip_wg_grad;
  int skip_in_grad;
  int opt_matmul_type_fw;
  int opt_matmul_type_wg;
  int opt_matmul_type_ig;
  int HWC;
};

// Minimal declarations for direct transpose + GEMM in PWConvGradX
struct transp_args {
  float *in_matrix;
  float *out_matrix;
  int N; /* rows of in_matrix (see PWConvGradX usage) */
  int M; /* cols of in_matrix */
  int *dim;
  int *transposed_axes;
  int n_dim;
};

struct matMul_args {
  float *__restrict__ A;
  float *__restrict__ B;
  float *__restrict__ C;
  int N; /* rows of A and C (see PWConvGradX usage) */
  int M; /* cols of B and C */
  int K; /* inner (contraction) dimension */
  int trans_B;
};

void transpose_matrix(void *void_args);
void mm(void *void_args);
output_blob.W = W_out; + output_blob.H = H_out; + output_blob.C = C_out; + output_blob.dim = C_out * H_out * W_out; + + coeff_blob.data = NULL; + coeff_blob.diff = (float *)pGradWeight; + coeff_blob.W = Q; + coeff_blob.H = P; + coeff_blob.C = C_out; + coeff_blob.dim = C_out * C_in * P * Q; + + bias_blob.data = NULL; + bias_blob.diff = NULL; + bias_blob.W = 1; + bias_blob.H = 1; + bias_blob.C = C_out; + bias_blob.dim = C_out; + + struct Conv2D_args conv_args; + memset(&conv_args, 0, sizeof(conv_args)); + + conv_args.input = &input_blob; + conv_args.output = &output_blob; + conv_args.coeff = &coeff_blob; + conv_args.bias = &bias_blob; + + conv_args.Lpad = (int)pad_left; + conv_args.Rpad = (int)pad_right; + conv_args.Upad = (int)pad_top; + conv_args.Dpad = (int)pad_bottom; + conv_args.stride_h = (int)SP; + conv_args.stride_w = (int)SQ; + + conv_args.i2c_buffer = NULL; + conv_args.bt_buffer = NULL; + + conv_args.skip_wg_grad = 0; + conv_args.skip_in_grad = 1; + conv_args.HWC = 0; + conv_args.USE_BIASES = 0; + conv_args.USE_IM2COL = 0; + conv_args.USE_DMA_IM2COL = 0; + + pulp_conv2d_fp32_bw_param_grads_cl(&conv_args); +} + +void PULP_ConvGradW2d_fp32_fp32_fp32_CHW_Im2Col( + const float *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out, + uint32_t C_out, const float *__restrict__ pInput, uint32_t H_in, + uint32_t W_in, uint32_t C_in, uint32_t P, uint32_t Q, uint32_t SP, + uint32_t SQ, float *__restrict__ pGradWeight, uint32_t pad_top, + uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right, + float *__restrict__ ctxtBuffer, uint32_t ctxtBufferSize) { + + struct blob input_blob = {0}; + struct blob output_blob = {0}; + struct blob coeff_blob = {0}; + struct blob bias_blob = {0}; + + input_blob.data = (float *)pInput; + input_blob.diff = NULL; + input_blob.W = W_in; + input_blob.H = H_in; + input_blob.C = C_in; + input_blob.dim = C_in * H_in * W_in; + + output_blob.data = NULL; + output_blob.diff = (float *)pGradOut; + output_blob.W = W_out; + output_blob.H = 
H_out; + output_blob.C = C_out; + output_blob.dim = C_out * H_out * W_out; + + coeff_blob.data = NULL; + coeff_blob.diff = (float *)pGradWeight; + coeff_blob.W = Q; + coeff_blob.H = P; + coeff_blob.C = C_in; + coeff_blob.dim = C_out * C_in * P * Q; + + bias_blob.data = NULL; + bias_blob.diff = NULL; + bias_blob.W = 1; + bias_blob.H = 1; + bias_blob.C = C_out; + bias_blob.dim = C_out; + + struct Conv2D_args conv_args; + memset(&conv_args, 0, sizeof(conv_args)); + + conv_args.input = &input_blob; + conv_args.output = &output_blob; + conv_args.coeff = &coeff_blob; + conv_args.bias = &bias_blob; + + conv_args.Lpad = (int)pad_left; + conv_args.Rpad = (int)pad_right; + conv_args.Upad = (int)pad_top; + conv_args.Dpad = (int)pad_bottom; + conv_args.stride_h = (int)SP; + conv_args.stride_w = (int)SQ; + + conv_args.i2c_buffer = ctxtBuffer; + conv_args.bt_buffer = NULL; + + conv_args.skip_wg_grad = 0; + conv_args.skip_in_grad = 1; + conv_args.HWC = 0; + conv_args.USE_BIASES = 0; + conv_args.USE_IM2COL = 1; + conv_args.USE_DMA_IM2COL = 0; + + pulp_conv2d_fp32_bw_param_grads_cl(&conv_args); +} + +void PULP_ConvGradX2d_fp32_fp32_fp32_CHW_trainlib( + const float *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out, + uint32_t C_out, const float *__restrict__ pWeight, uint32_t C_in, + uint32_t P, uint32_t Q, uint32_t SP, uint32_t SQ, + float *__restrict__ pGradIn, uint32_t H_in, uint32_t W_in, uint32_t pad_top, + uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right) { + + struct blob input_blob = (struct blob){0}; + struct blob output_blob = (struct blob){0}; + struct blob coeff_blob = (struct blob){0}; + struct blob bias_blob = (struct blob){0}; + + input_blob.data = NULL; + input_blob.diff = (float *)pGradIn; + input_blob.W = (int)W_in; + input_blob.H = (int)H_in; + input_blob.C = (int)C_in; + input_blob.dim = (int)(C_in * H_in * W_in); + + output_blob.data = NULL; + output_blob.diff = (float *)pGradOut; + output_blob.W = (int)W_out; + output_blob.H = (int)H_out; + 
output_blob.C = (int)C_out; + output_blob.dim = (int)(C_out * H_out * W_out); + + coeff_blob.data = (float *)pWeight; + coeff_blob.diff = NULL; + coeff_blob.W = (int)Q; + coeff_blob.H = (int)P; + coeff_blob.C = (int)C_out; + coeff_blob.dim = (int)(C_out * C_in * P * Q); + + bias_blob.data = NULL; + bias_blob.diff = NULL; + bias_blob.W = 1; + bias_blob.H = 1; + bias_blob.C = (int)C_out; + bias_blob.dim = (int)C_out; + + struct Conv2D_args conv_args; + memset(&conv_args, 0, sizeof(conv_args)); + + conv_args.input = &input_blob; + conv_args.output = &output_blob; + conv_args.coeff = &coeff_blob; + conv_args.bias = &bias_blob; + + conv_args.Lpad = (int)pad_left; + conv_args.Rpad = (int)pad_right; + conv_args.Upad = (int)pad_top; + conv_args.Dpad = (int)pad_bottom; + conv_args.stride_h = (int)SP; + conv_args.stride_w = (int)SQ; + + conv_args.i2c_buffer = NULL; + conv_args.bt_buffer = NULL; + + conv_args.skip_wg_grad = 1; + conv_args.skip_in_grad = 0; + conv_args.HWC = 0; + conv_args.USE_BIASES = 0; + conv_args.USE_IM2COL = 0; + conv_args.USE_DMA_IM2COL = 0; + + pulp_conv2d_fp32_bw_input_grads_cl(&conv_args); +} + +void PULP_ConvGradX2d_fp32_fp32_fp32_CHW_Im2Col( + const float *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out, + uint32_t C_out, const float *__restrict__ pWeight, uint32_t C_in, + uint32_t P, uint32_t Q, uint32_t SP, uint32_t SQ, + float *__restrict__ pGradIn, uint32_t H_in, uint32_t W_in, uint32_t pad_top, + uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right, + float *__restrict__ ctxtBuffer, uint32_t ctxtBufferSize, + float *__restrict__ btBuffer, uint32_t btBufferSize) { + + struct blob input_blob = {0}; + struct blob output_blob = {0}; + struct blob coeff_blob = {0}; + struct blob bias_blob = {0}; + + input_blob.data = NULL; + input_blob.diff = (float *)pGradIn; + input_blob.W = (int)W_in; + input_blob.H = (int)H_in; + input_blob.C = (int)C_in; + input_blob.dim = (int)(C_in * H_in * W_in); + + output_blob.data = NULL; + output_blob.diff = 
(float *)pGradOut; + output_blob.W = (int)W_out; + output_blob.H = (int)H_out; + output_blob.C = (int)C_out; + output_blob.dim = (int)(C_out * H_out * W_out); + + coeff_blob.data = (float *)pWeight; + coeff_blob.diff = NULL; + coeff_blob.W = (int)Q; + coeff_blob.H = (int)P; + coeff_blob.C = (int)C_out; + coeff_blob.dim = (int)(C_out * C_in * P * Q); + + bias_blob.data = NULL; + bias_blob.diff = NULL; + bias_blob.W = 1; + bias_blob.H = 1; + bias_blob.C = (int)C_out; + bias_blob.dim = (int)C_out; + + struct Conv2D_args conv_args; + memset(&conv_args, 0, sizeof(conv_args)); + + conv_args.input = &input_blob; + conv_args.output = &output_blob; + conv_args.coeff = &coeff_blob; + conv_args.bias = &bias_blob; + + conv_args.Lpad = (int)pad_left; + conv_args.Rpad = (int)pad_right; + conv_args.Upad = (int)pad_top; + conv_args.Dpad = (int)pad_bottom; + conv_args.stride_h = (int)SP; + conv_args.stride_w = (int)SQ; + + conv_args.i2c_buffer = ctxtBuffer; + conv_args.bt_buffer = btBuffer; + + conv_args.skip_wg_grad = 1; + conv_args.skip_in_grad = 0; + conv_args.HWC = 0; + conv_args.USE_BIASES = 0; + conv_args.USE_IM2COL = 1; + conv_args.USE_DMA_IM2COL = 0; + + pulp_conv2d_fp32_bw_input_grads_cl(&conv_args); +} + +static inline int32_t max_i32(int32_t a, int32_t b) { return (a > b) ? a : b; } +static inline int32_t min_i32(int32_t a, int32_t b) { return (a < b) ? 
a : b; } + +void PULP_ConvGradX2d_fp32_fp32_fp32_CHW_tiled( + const float *__restrict__ pGradOut, + uint32_t dim_im_out_x, // H_out (tile) + uint32_t dim_im_out_y, // W_out (tile) + uint32_t ch_im_out, // C_out (full) + const float *__restrict__ pWeight, + uint32_t ch_im_in, // C_in (full) + uint32_t dim_kernel_x, // P (kernel H) + uint32_t dim_kernel_y, // Q (kernel W) + uint32_t stride_h, // SH + uint32_t stride_w, // SW + float *__restrict__ pGradIn, + uint32_t dim_im_in_x, // H_in (tile) + uint32_t dim_im_in_y, // W_in (tile) + uint32_t padding_x_left, // pad_top + uint32_t padding_x_right, // pad_bottom (unused here) + uint32_t padding_y_top, // pad_left + uint32_t padding_y_bottom,// pad_right (unused here) + uint16_t offset_grad_in_h, + uint16_t offset_grad_in_w, + uint16_t offset_grad_out_h, + uint16_t offset_grad_out_w +){ + (void)padding_x_right; + (void)padding_y_bottom; + + const uint32_t Hout_t = dim_im_out_x; + const uint32_t Wout_t = dim_im_out_y; + const uint32_t Hin_t = dim_im_in_x; + const uint32_t Win_t = dim_im_in_y; + + const uint32_t Cout = ch_im_out; + const uint32_t Cin = ch_im_in; + + const uint32_t P = dim_kernel_x; + const uint32_t Q = dim_kernel_y; + + const int32_t pad_top = (int32_t)padding_x_left; + const int32_t pad_left = (int32_t)padding_y_top; + + const int32_t sh = (int32_t)stride_h; + const int32_t sw = (int32_t)stride_w; + + const int32_t hx0 = (int32_t)offset_grad_in_h; + const int32_t wx0 = (int32_t)offset_grad_in_w; + const int32_t hx1 = hx0 + (int32_t)Hin_t - 1; + const int32_t wx1 = wx0 + (int32_t)Win_t - 1; + + // -------- core partition over Cin -------- + const int core_id = pi_core_id(); + const int ncores = NUM_CORES; + + const uint32_t ci_chunk = (Cin + (uint32_t)ncores - 1u) / (uint32_t)ncores; + const uint32_t ci_start = (uint32_t)core_id * ci_chunk; + uint32_t ci_stop = ci_start + ci_chunk; + if (ci_stop > Cin) ci_stop = Cin; + + if (ci_start >= ci_stop) { + return; + } + + for (uint32_t ci = ci_start; ci < 
ci_stop; ++ci) { + float *dx_ci = pGradIn + (size_t)ci * Hin_t * Win_t; + for (uint32_t ih = 0; ih < Hin_t; ++ih) { + for (uint32_t iw = 0; iw < Win_t; ++iw) { + dx_ci[ih * Win_t + iw] = 0.0f; + } + } + } + + for (uint32_t co = 0; co < Cout; ++co) { + const float *dy_co = pGradOut + (size_t)co * Hout_t * Wout_t; + + for (uint32_t ly = 0; ly < Hout_t; ++ly) { + const int32_t oy = (int32_t)offset_grad_out_h + (int32_t)ly; + const int32_t base_h = oy * sh - pad_top; + + for (uint32_t lx = 0; lx < Wout_t; ++lx) { + const int32_t ox = (int32_t)offset_grad_out_w + (int32_t)lx; + const int32_t base_w = ox * sw - pad_left; + + const float dy_val = dy_co[ly * Wout_t + lx]; + + // prune ky/kx once per (co,ly,lx) (independent of ci) + int32_t ky_min = max_i32(0, hx0 - base_h); + int32_t ky_max = min_i32((int32_t)P - 1, hx1 - base_h); + if (ky_min > ky_max) continue; + + int32_t kx_min = max_i32(0, wx0 - base_w); + int32_t kx_max = min_i32((int32_t)Q - 1, wx1 - base_w); + if (kx_min > kx_max) continue; + + for (uint32_t ci = ci_start; ci < ci_stop; ++ci) { + float *dx_ci = pGradIn + (size_t)ci * Hin_t * Win_t; + + // W[co,ci,:,:] base (assumes layout [Cout][Cin][P][Q]) + const float *w_co_ci = pWeight + + (((size_t)co * (size_t)Cin + (size_t)ci) * (size_t)P * (size_t)Q); + + for (int32_t ky = ky_min; ky <= ky_max; ++ky) { + const int32_t ih = (base_h + ky) - hx0; // local in tile + + for (int32_t kx = kx_min; kx <= kx_max; ++kx) { + const int32_t iw = (base_w + kx) - wx0; + + dx_ci[(uint32_t)ih * Win_t + (uint32_t)iw] += + dy_val * w_co_ci[(size_t)ky * (size_t)Q + (size_t)kx]; + } + } + } + } + } + } +} + +void PULP_ConvGradX2d_fp32_fp32_fp32_CHW( + const float *__restrict__ pGradOut, + uint32_t dim_im_out_x, // H_out (tile) + uint32_t dim_im_out_y, // W_out (tile) + uint32_t ch_im_out, // C_out (full) + const float *__restrict__ pWeight, + uint32_t ch_im_in, // C_in (full) + uint32_t dim_kernel_x, // P + uint32_t dim_kernel_y, // Q + uint32_t stride_h, // SP + uint32_t 
stride_w, // SQ + float *__restrict__ pGradIn, + uint32_t dim_im_in_x, // H_in (tile) + uint32_t dim_im_in_y, // W_in (tile) + uint32_t padding_x_left, // pad_top (tile-specific) + uint32_t padding_x_right, // pad_bottom (tile-specific) + uint32_t padding_y_top, // pad_left (tile-specific) + uint32_t padding_y_bottom // pad_right (tile-specific) +) { + // Map to more intuitive names + const uint32_t H_out = dim_im_out_x; + const uint32_t W_out = dim_im_out_y; + const uint32_t C_out = ch_im_out; + const uint32_t C_in = ch_im_in; + const uint32_t P = dim_kernel_x; + const uint32_t Q = dim_kernel_y; + const uint32_t H_in = dim_im_in_x; + const uint32_t W_in = dim_im_in_y; + const uint32_t pad_top = padding_x_left; + const uint32_t pad_bottom = padding_x_right; + const uint32_t pad_left = padding_y_top; + const uint32_t pad_right = padding_y_bottom; + + int8_t core_id = pi_core_id(); + int8_t log2Core = LOG2(NUM_CORES); + + // Parallelize over input channels (C_in) + uint16_t ch_chunk = (C_in >> log2Core) + ((C_in & (NUM_CORES - 1)) != 0); + uint16_t ch_start = MIN(ch_chunk * core_id, C_in); + uint16_t ch_stop = MIN(ch_start + ch_chunk, C_in); + + // ========================================================================= + // Step 1: Zero-initialize dX tile for this core's channel range + // ========================================================================= + // CHW layout: [C, H, W] + for (uint32_t c_in = ch_start; c_in < ch_stop; ++c_in) { + for (uint32_t h = 0; h < H_in; ++h) { + for (uint32_t w = 0; w < W_in; ++w) { + uint32_t dx_idx = (c_in * H_in + h) * W_in + w; + pGradIn[dx_idx] = 0.0f; + } + } + } + + // ========================================================================= + // Step 2: Compute gradient via transposed convolution + // ========================================================================= + // For each input channel assigned to this core + for (uint32_t c_in = ch_start; c_in < ch_stop; ++c_in) { + // For each kernel position + 
for (uint32_t kh = 0; kh < P; ++kh) { + for (uint32_t kw = 0; kw < Q; ++kw) { + // For each output position in dY tile + for (uint32_t h_out = 0; h_out < H_out; ++h_out) { + // Compute corresponding input position + int32_t h_in = (int32_t)h_out * (int32_t)stride_h + + (int32_t)kh - (int32_t)pad_top; + + // Check bounds (tile-local) + if (h_in < 0 || h_in >= (int32_t)H_in) { + continue; + } + + for (uint32_t w_out = 0; w_out < W_out; ++w_out) { + // Compute corresponding input position + int32_t w_in = (int32_t)w_out * (int32_t)stride_w + + (int32_t)kw - (int32_t)pad_left; + + // Check bounds (tile-local) + if (w_in < 0 || w_in >= (int32_t)W_in) { + continue; + } + + // Accumulate gradient contributions from all output channels + // dX index: CHW layout [C_in, H_in, W_in] + uint32_t dx_idx = (c_in * H_in + (uint32_t)h_in) * W_in + (uint32_t)w_in; + + for (uint32_t c_out = 0; c_out < C_out; ++c_out) { + // dY index: CHW layout [C_out, H_out, W_out] + uint32_t dy_idx = (c_out * H_out + h_out) * W_out + w_out; + + // Weight index: [C_out, C_in, P, Q] layout + uint32_t w_idx = ((c_out * C_in + c_in) * P + kh) * Q + kw; + + // Accumulate: dX += dY * W + pGradIn[dx_idx] += pGradOut[dy_idx] * pWeight[w_idx]; + } + } + } + } + } + } +} + + +void PULP_DWConvTrans2d_fp32_fp32_fp32_HWC( + const float *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out, + uint32_t C_total, const float *__restrict__ pWeight, uint32_t P, uint32_t Q, + uint32_t SP, uint32_t SQ, float *__restrict__ pGradIn, uint32_t pad_top, + uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right) { + + uint32_t H_in = (H_out - 1) * SP + P - pad_top - pad_bottom; + uint32_t W_in = (W_out - 1) * SQ + Q - pad_left - pad_right; + + memset(pGradIn, 0, sizeof(float) * (C_total * H_in * W_in)); + struct blob input_blob = {0}; + struct blob coeff_blob = {0}; + struct blob output_blob = {0}; + + input_blob.data = NULL; + input_blob.diff = (float *)pGradIn; + input_blob.W = (int)W_in; + input_blob.H = (int)H_in; + 
input_blob.C = (int)C_total; + input_blob.dim = (int)(C_total * H_in * W_in); + + coeff_blob.data = (float *)pWeight; + coeff_blob.diff = NULL; + coeff_blob.W = (int)Q; + coeff_blob.H = (int)P; + coeff_blob.C = (int)C_total; + coeff_blob.dim = (int)(C_total * P * Q); + + output_blob.data = NULL; + output_blob.diff = (float *)pGradOut; + output_blob.W = (int)W_out; + output_blob.H = (int)H_out; + output_blob.C = (int)C_total; + output_blob.dim = (int)(C_total * H_out * W_out); + + struct DepthWise_Conv_args dw_args; + memset(&dw_args, 0, sizeof(dw_args)); + + dw_args.input = &input_blob; + dw_args.coeff = &coeff_blob; + dw_args.output = &output_blob; + + dw_args.stride_h = (int)SP; + dw_args.stride_w = (int)SQ; + + dw_args.Lpad = (int)pad_left; + dw_args.Rpad = (int)pad_right; + dw_args.Upad = (int)pad_top; + dw_args.Dpad = (int)pad_bottom; + + dw_args.skip_wg_grad = 1; + dw_args.skip_in_grad = 0; + + dw_args.HWC = 0; + pulp_conv_dw_fp32_bw_input_grads_cl(&dw_args); +} + +void PULP_DWConvGradW2d_fp32_fp32_fp32_CHW( + const float *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out, + uint32_t C_out, const float *__restrict__ pInput, uint32_t H_in, + uint32_t W_in, uint32_t C_in, uint32_t P, uint32_t Q, uint32_t SP, + uint32_t SQ, float *__restrict__ pGradWeight, uint32_t pad_top, + uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right) { + // Supports depthwise convolution with multiplier + // For depthwise: groups = C_in, C_out = C_in * multiplier + // Weight shape: [C_out, 1, P, Q] + + uint32_t gradw_elems = C_out * (C_in / C_out) * P * Q; + + struct blob input_blob = {0}; + struct blob coeff_blob = {0}; + struct blob output_blob = {0}; + + input_blob.data = (float *)pInput; + input_blob.diff = NULL; + input_blob.W = (int)W_in; + input_blob.H = (int)H_in; + input_blob.C = (int)C_in; + input_blob.dim = (int)(C_in * H_in * W_in); + + coeff_blob.data = NULL; + coeff_blob.diff = (float *)pGradWeight; + coeff_blob.W = (int)Q; + coeff_blob.H = (int)P; + 
coeff_blob.C = (int)C_out; // Fixed: should be C_out for DW with multiplier + coeff_blob.dim = (int)(C_out * P * Q); // Fixed: total weight elements + + output_blob.data = NULL; + output_blob.diff = (float *)pGradOut; + output_blob.W = (int)W_out; + output_blob.H = (int)H_out; + output_blob.C = (int)C_out; + output_blob.dim = (int)(C_out * H_out * W_out); + + struct DepthWise_Conv_args dw_args; + memset(&dw_args, 0, sizeof(dw_args)); + + dw_args.input = &input_blob; + dw_args.coeff = &coeff_blob; + dw_args.output = &output_blob; + + dw_args.stride_h = (int)SP; + dw_args.stride_w = (int)SQ; + + dw_args.Lpad = (int)pad_left; + dw_args.Rpad = (int)pad_right; + dw_args.Upad = (int)pad_top; + dw_args.Dpad = (int)pad_bottom; + + dw_args.skip_wg_grad = 0; + dw_args.skip_in_grad = 1; + dw_args.HWC = 0; + pulp_conv_dw_fp32_bw_param_grads_cl(&dw_args); +} + +// ============================================================================ +// Pointwise Convolution Gradient Functions (using pulptrainlib pw interfaces) +// ============================================================================ + +void PULP_PWConvGradW2d_fp32_fp32_fp32_CHW( + const float *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out, + uint32_t C_out, const float *__restrict__ pInput, uint32_t H_in, + uint32_t W_in, uint32_t C_in, float *__restrict__ pGradWeight) { + + struct blob input_blob = {0}; + struct blob output_blob = {0}; + struct blob coeff_blob = {0}; + + // Input blob (forward activation) + input_blob.data = (float *)pInput; + input_blob.diff = NULL; + input_blob.W = (int)W_in; + input_blob.H = (int)H_in; + input_blob.C = (int)C_in; + input_blob.dim = (int)(C_in * H_in * W_in); + + // Output blob (gradient w.r.t. output) + output_blob.data = NULL; + output_blob.diff = (float *)pGradOut; + output_blob.W = (int)W_out; + output_blob.H = (int)H_out; + output_blob.C = (int)C_out; + output_blob.dim = (int)(C_out * H_out * W_out); + + // Weight blob (gradient w.r.t. 
weights - output) + // For PW conv: kernel is 1x1, so dim = C_out * C_in + coeff_blob.data = NULL; + coeff_blob.diff = (float *)pGradWeight; + coeff_blob.W = 1; + coeff_blob.H = 1; + coeff_blob.C = (int)C_in; + coeff_blob.dim = (int)(C_out * C_in); + + struct PointWise_Conv_args pw_args; + memset(&pw_args, 0, sizeof(pw_args)); + + pw_args.input = &input_blob; + pw_args.output = &output_blob; + pw_args.coeff = &coeff_blob; + pw_args.transpose_buffer = NULL; + + pw_args.skip_wg_grad = 0; // Compute weight gradient + pw_args.skip_in_grad = 1; // Skip input gradient + pw_args.HWC = 0; // CHW layout + pw_args.opt_matmul_type_fw = 0; + pw_args.opt_matmul_type_wg = 0; + pw_args.opt_matmul_type_ig = 0; + + pulp_conv_pw_fp32_bw_param_grads_cl(&pw_args); +} + + +void PULP_PWConvGradX2d_fp32_fp32_fp32_CHW( + const float *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out, + uint32_t C_out, const float *__restrict__ pWeight, uint32_t C_in, + float *__restrict__ pGradIn, uint32_t H_in, uint32_t W_in, + float *__restrict__ pTransposeBuffer, uint32_t transposeBufferSize) { + + // pulp_conv_pw_fp32_bw_input_grads_cl has a bug: it passes M=C_out, N=C_in + // to transpose_matrix, treating W as [C_in rows, C_out cols], but W is + // stored as [C_out, C_in] row-major. This works only when C_in == C_out. + // Fix: call transpose_matrix directly with N=C_out, M=C_in (correct dims), + // then call mm directly with the correctly transposed buffer. 
+ + memset(pGradIn, 0, sizeof(float) * (C_in * H_in * W_in)); + + // Step 1: Transpose W[C_out, C_in] -> pTransposeBuffer[C_in, C_out] + // N = C_out (rows of W), M = C_in (cols of W) -> output [M=C_in, N=C_out] + struct transp_args tr_args; + tr_args.in_matrix = (float *)pWeight; + tr_args.out_matrix = pTransposeBuffer; + tr_args.N = (int)C_out; + tr_args.M = (int)C_in; + tr_args.dim = NULL; + tr_args.transposed_axes = NULL; + tr_args.n_dim = 0; + pi_cl_team_fork(NUM_CORES, transpose_matrix, &tr_args); + + // Step 2: GEMM: dX[C_in, H*W] = W^T[C_in, C_out] x dY[C_out, H*W] + struct matMul_args mm_args; + mm_args.A = pTransposeBuffer; // [C_in, C_out] + mm_args.B = (float *)pGradOut; // [C_out, H*W] + mm_args.C = pGradIn; // [C_in, H*W] + mm_args.N = (int)C_in; + mm_args.M = (int)(H_out * W_out); + mm_args.K = (int)C_out; + mm_args.trans_B = 0; + pi_cl_team_fork(NUM_CORES, mm, &mm_args); +} + +// Tile-aware Im2Col-based ConvGradX kernel with offset support +void PULP_ConvGradX2d_fp32_fp32_fp32_CHW_Im2Col_tiled( + const float *__restrict__ pGradOut, // dY tile (L1) + uint32_t dim_im_out_x, // dY tile H + uint32_t dim_im_out_y, // dY tile W + uint32_t ch_im_out, // C_out (full) + const float *__restrict__ pWeight, // W + uint32_t ch_im_in, // C_in (full) + uint32_t dim_kernel_x, // P (kernel H) + uint32_t dim_kernel_y, // Q (kernel W) + uint32_t stride_h, // stride H + uint32_t stride_w, // stride W + float *__restrict__ pGradIn, // dX tile (L1) + uint32_t dim_im_in_x, // dX tile H + uint32_t dim_im_in_y, // dX tile W + uint32_t padding_y_top, // pad top (tile-specific) + uint32_t padding_y_bottom, // pad bottom (tile-specific) + uint32_t padding_x_left, // pad left (tile-specific) + uint32_t padding_x_right, // pad right (tile-specific) + uint16_t offset_grad_in_h, // dX tile offset H (global) + uint16_t offset_grad_in_w, // dX tile offset W (global) + uint16_t offset_grad_out_h, // dY tile offset H (global) + uint16_t offset_grad_out_w, // dY tile offset W (global) 
+ float *__restrict__ ctxtBuffer, + uint32_t ctxtBufferSize, + float *__restrict__ btBuffer, + uint32_t btBufferSize +) { + const uint32_t Hout_t = dim_im_out_x; + const uint32_t Wout_t = dim_im_out_y; + const uint32_t Hin_t = dim_im_in_x; + const uint32_t Win_t = dim_im_in_y; + + const uint32_t Cout = ch_im_out; + const uint32_t Cin = ch_im_in; + + const uint32_t P = dim_kernel_x; + const uint32_t Q = dim_kernel_y; + + const int32_t pad_top = (int32_t)padding_y_top; + const int32_t pad_left = (int32_t)padding_x_left; + + const int32_t sh = (int32_t)stride_h; + const int32_t sw = (int32_t)stride_w; + + const int32_t hx0 = (int32_t)offset_grad_in_h; + const int32_t wx0 = (int32_t)offset_grad_in_w; + const int32_t hx1 = hx0 + (int32_t)Hin_t - 1; + const int32_t wx1 = wx0 + (int32_t)Win_t - 1; + + // Core partition over Cin + const int core_id = pi_core_id(); + const int ncores = NUM_CORES; + + const uint32_t ci_chunk = (Cin + (uint32_t)ncores - 1u) / (uint32_t)ncores; + const uint32_t ci_start = (uint32_t)core_id * ci_chunk; + uint32_t ci_stop = ci_start + ci_chunk; + if (ci_stop > Cin) ci_stop = Cin; + + if (ci_start >= ci_stop) { + return; + } + + // Initialize output tile to zero + for (uint32_t ci = ci_start; ci < ci_stop; ++ci) { + float *dx_ci = pGradIn + (size_t)ci * Hin_t * Win_t; + for (uint32_t ih = 0; ih < Hin_t; ++ih) { + for (uint32_t iw = 0; iw < Win_t; ++iw) { + dx_ci[ih * Win_t + iw] = 0.0f; + } + } + } + + // Compute gradient using tile-aware mapping + for (uint32_t co = 0; co < Cout; ++co) { + const float *dy_co = pGradOut + (size_t)co * Hout_t * Wout_t; + + for (uint32_t ly = 0; ly < Hout_t; ++ly) { + const int32_t oy = (int32_t)offset_grad_out_h + (int32_t)ly; + const int32_t base_h = oy * sh - pad_top; + + for (uint32_t lx = 0; lx < Wout_t; ++lx) { + const int32_t ox = (int32_t)offset_grad_out_w + (int32_t)lx; + const int32_t base_w = ox * sw - pad_left; + + const float dy_val = dy_co[ly * Wout_t + lx]; + + // Prune kernel positions + int32_t 
ky_min = (hx0 > base_h) ? (hx0 - base_h) : 0; + int32_t ky_max = (hx1 < base_h + (int32_t)P - 1) ? (hx1 - base_h) : ((int32_t)P - 1); + if (ky_min > ky_max) continue; + + int32_t kx_min = (wx0 > base_w) ? (wx0 - base_w) : 0; + int32_t kx_max = (wx1 < base_w + (int32_t)Q - 1) ? (wx1 - base_w) : ((int32_t)Q - 1); + if (kx_min > kx_max) continue; + + for (uint32_t ci = ci_start; ci < ci_stop; ++ci) { + float *dx_ci = pGradIn + (size_t)ci * Hin_t * Win_t; + + // W[co,ci,:,:] base (layout [Cout][Cin][P][Q]) + const float *w_co_ci = pWeight + (((size_t)co * (size_t)Cin + (size_t)ci) * (size_t)P * (size_t)Q); + + for (int32_t ky = ky_min; ky <= ky_max; ++ky) { + const int32_t ih = (base_h + ky) - hx0; // local tile coordinate + + for (int32_t kx = kx_min; kx <= kx_max; ++kx) { + const int32_t iw = (base_w + kx) - wx0; // local tile coordinate + + dx_ci[(uint32_t)ih * Win_t + (uint32_t)iw] += + dy_val * w_co_ci[(size_t)ky * (size_t)Q + (size_t)kx]; + } + } + } + } + } + } +} + + +void PULP_DWConvGradX2d_fp32_fp32_fp32_CHW_tiled( + const float *__restrict__ pGradOut, + uint32_t dim_im_out_x, // H_out (tile) + uint32_t dim_im_out_y, // W_out (tile) + uint32_t ch_im_out, // C_out (full) + const float *__restrict__ pWeight, + uint32_t ch_im_in, // C_in (full) + uint32_t dim_kernel_x, // P (kernel H) + uint32_t dim_kernel_y, // Q (kernel W) + uint32_t stride_h, // SH + uint32_t stride_w, // SW + float *__restrict__ pGradIn, + uint32_t dim_im_in_x, // H_in (tile) + uint32_t dim_im_in_y, // W_in (tile) + uint32_t padding_x_left, // pad_top -- NOTE(review): x-named pad maps to the vertical axis here, opposite of the Im2Col variant above; confirm caller argument order + uint32_t padding_x_right, // pad_bottom (unused here) + uint32_t padding_y_top, // pad_left + uint32_t padding_y_bottom,// pad_right (unused here) + uint16_t offset_grad_in_h, + uint16_t offset_grad_in_w, + uint16_t offset_grad_out_h, + uint16_t offset_grad_out_w +){ + (void)padding_x_right; + (void)padding_y_bottom; + + const uint32_t Hout_t = dim_im_out_x; + const uint32_t Wout_t = dim_im_out_y; + const uint32_t Hin_t = dim_im_in_x; + const 
uint32_t Win_t = dim_im_in_y; + + const uint32_t Cout_full = ch_im_out; + const uint32_t Cin_full = ch_im_in; + + const uint32_t P = dim_kernel_x; + const uint32_t Q = dim_kernel_y; + + const int32_t pad_top = (int32_t)padding_x_left; + const int32_t pad_left = (int32_t)padding_y_top; + + const int32_t sh = (int32_t)stride_h; + const int32_t sw = (int32_t)stride_w; + + // dx tile global box [hx0..hx1] x [wx0..wx1] + const int32_t hx0 = (int32_t)offset_grad_in_h; + const int32_t wx0 = (int32_t)offset_grad_in_w; + const int32_t hx1 = hx0 + (int32_t)Hin_t - 1; + const int32_t wx1 = wx0 + (int32_t)Win_t - 1; + + // -------- Grouped/Depthwise Convolution Parameters -------- + // For depthwise: groups = Cin, channels_per_group_in = 1 + // For grouped: groups divides both Cin and Cout + // Assume groups = Cin (standard depthwise with multiplier) + const uint32_t groups = Cin_full; + const uint32_t channels_per_group_out = Cout_full / groups; + + // -------- core partition over input channels -------- + const int core_id = pi_core_id(); + const int ncores = NUM_CORES; + + const uint32_t ci_chunk = (Cin_full + (uint32_t)ncores - 1u) / (uint32_t)ncores; + const uint32_t ci_start = (uint32_t)core_id * ci_chunk; + uint32_t ci_stop = ci_start + ci_chunk; + if (ci_stop > Cin_full) ci_stop = Cin_full; + + if (ci_start >= ci_stop) { + return; + } + + // ---- Clear dx tile for this core's input channels ---- + for (uint32_t ci = ci_start; ci < ci_stop; ++ci) { + float *dx_ci = pGradIn + (size_t)ci * (size_t)Hin_t * (size_t)Win_t; + + for (uint32_t ih = 0; ih < Hin_t; ++ih) { + float *row = dx_ci + (size_t)ih * (size_t)Win_t; + for (uint32_t iw = 0; iw < Win_t; ++iw) { + row[iw] = 0.0f; + } + } + } + + // ---- Main computation: scatter from dy to dx ---- + // For each input channel assigned to this core + for (uint32_t ci = ci_start; ci < ci_stop; ++ci) { + float *dx_ci = pGradIn + (size_t)ci * (size_t)Hin_t * (size_t)Win_t; + + // Determine which output channels contribute to this 
input channel + // For depthwise with multiplier: input channel ci corresponds to + // output channels [ci * channels_per_group_out, (ci+1) * channels_per_group_out) + const uint32_t co_start = ci * channels_per_group_out; + const uint32_t co_stop = co_start + channels_per_group_out; + + // Accumulate gradients from all corresponding output channels + for (uint32_t co = co_start; co < co_stop; ++co) { + const float *dy_co = pGradOut + (size_t)co * (size_t)Hout_t * (size_t)Wout_t; + + // DW weight layout: [Cout][1][P][Q] -> for channel co, weights at [co][P][Q] + const float *w_co = pWeight + (size_t)co * (size_t)P * (size_t)Q; + + // ---- Scatter from dy tile into dx tile ---- + for (uint32_t ly = 0; ly < Hout_t; ++ly) { + const int32_t oy = (int32_t)offset_grad_out_h + (int32_t)ly; + const int32_t base_h = oy * sh - pad_top; + + for (uint32_t lx = 0; lx < Wout_t; ++lx) { + const int32_t ox = (int32_t)offset_grad_out_w + (int32_t)lx; + const int32_t base_w = ox * sw - pad_left; + + const float dy_val = dy_co[ly * Wout_t + lx]; + + // Intersect kernel footprint with dx tile bounds + int32_t ky_min = max_i32(0, hx0 - base_h); + int32_t ky_max = min_i32((int32_t)P - 1, hx1 - base_h); + if (ky_min > ky_max) continue; + + int32_t kx_min = max_i32(0, wx0 - base_w); + int32_t kx_max = min_i32((int32_t)Q - 1, wx1 - base_w); + if (kx_min > kx_max) continue; + + for (int32_t ky = ky_min; ky <= ky_max; ++ky) { + const int32_t ih = (base_h + ky) - hx0; // local in dx tile + + for (int32_t kx = kx_min; kx <= kx_max; ++kx) { + const int32_t iw = (base_w + kx) - wx0; + + const size_t w_idx = + (size_t)(uint32_t)ky * (size_t)Q + + (size_t)(uint32_t)kx; + + dx_ci[(size_t)(uint32_t)ih * (size_t)Win_t + (size_t)(uint32_t)iw] += + dy_val * w_co[w_idx]; + } + } + } + } + } + } +} + + + + + + + + + + + diff --git a/TargetLibraries/PULPOpen/src/GlobalAveragePool.c b/TargetLibraries/PULPOpen/src/GlobalAveragePool.c new file mode 100644 index 0000000000..baebbad2ee --- /dev/null +++ 
b/TargetLibraries/PULPOpen/src/GlobalAveragePool.c @@ -0,0 +1,57 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeployPULPMath.h" +#include "pmsis.h" + + +void PULP_GlobalAveragePool_fp32(const float32_t *input, float32_t *output, + uint32_t N, uint32_t C, uint32_t H, uint32_t W) { + int8_t core_id = pi_core_id(); + int8_t log2Core = LOG2(NUM_CORES); + + uint16_t ch_chunk = (C >> log2Core) + ((C & (NUM_CORES - 1)) != 0); + uint16_t ch_start = MIN(ch_chunk * core_id, C); + uint16_t ch_stop = MIN(ch_start + ch_chunk, C); + + uint32_t HW = H * W; + float32_t inv_HW = 1.0f / (float32_t)HW; + + for (uint32_t n = 0; n < N; ++n) { + for (uint32_t c = ch_start; c < ch_stop; ++c) { + float32_t sum = 0.0f; + uint32_t in_base = (n * C + c) * HW; + for (uint32_t i = 0; i < HW; ++i) { + sum += input[in_base + i]; + } + output[n * C + c] = sum * inv_HW; + } + } +} + + +void PULP_GlobalAveragePoolGrad_fp32(const float32_t *dY, float32_t *dX, + uint32_t N, uint32_t C, uint32_t H, uint32_t W) { + int8_t core_id = pi_core_id(); + int8_t log2Core = LOG2(NUM_CORES); + + uint16_t ch_chunk = (C >> log2Core) + ((C & (NUM_CORES - 1)) != 0); + uint16_t ch_start = MIN(ch_chunk * core_id, C); + uint16_t ch_stop = MIN(ch_start + ch_chunk, C); + + uint32_t HW = H * W; + float32_t inv_HW = 1.0f / (float32_t)HW; + + for (uint32_t n = 0; n < N; ++n) { + for (uint32_t c = ch_start; c < ch_stop; ++c) { + float32_t dy_val = dY[n * C + c] * inv_HW; + uint32_t out_base = (n * C + c) * HW; + for (uint32_t i = 0; i < HW; ++i) { + dX[out_base + i] = dy_val; + } + } + } +} diff --git a/TargetLibraries/PULPOpen/src/Layernorm.c b/TargetLibraries/PULPOpen/src/Layernorm.c index 9324ff19ee..a1c0a02570 100644 --- a/TargetLibraries/PULPOpen/src/Layernorm.c +++ b/TargetLibraries/PULPOpen/src/Layernorm.c @@ -10,9 +10,20 @@ #include +/* + * Forward pass: LayerNorm with stash output + * + * Normalizes along the last 
dimension (axis=-1). + * Parallelized across sequence positions: each core processes a chunk of + * sequences. In addition to the normalized output (data_out), writes the + * per-sequence mean and 1/sqrt(var+eps) into mean_out and inv_std_dev_out + * so the backward pass can reuse them without recomputation. + */ void PULP_Layernorm_fp32_fp32(float32_t *data_in, float32_t *data_out, - float32_t *scale, float32_t *bias, uint32_t size, - uint32_t lastDimLength, float32_t epsilon) { + float32_t *scale, float32_t *bias, + float32_t *mean_out, float32_t *inv_std_dev_out, + uint32_t size, uint32_t lastDimLength, + float32_t epsilon) { int8_t core_id = pi_core_id(); int8_t log2Core = LOG2(NUM_CORES); @@ -29,36 +40,118 @@ void PULP_Layernorm_fp32_fp32(float32_t *data_in, float32_t *data_out, float32_t *local_data_in = data_in + elem_start; float32_t *local_data_out = data_out + elem_start; int32_t local_size = elem_end - elem_start; - - float32_t mean; - float32_t sum; - float32_t std; - float32_t temp; - int32_t local_seq_count = local_size / lastDimLength; for (int32_t i = 0; i < local_seq_count; i++) { + float32_t *row_in = local_data_in + i * lastDimLength; + float32_t *row_out = local_data_out + i * lastDimLength; - sum = 0.0f; - mean = 0.0f; + /* Compute mean */ + float32_t mean = 0.0f; for (int32_t j = 0; j < lastDimLength; j++) { - mean += local_data_in[j + i * lastDimLength]; + mean += row_in[j]; } - mean = mean / (float32_t)lastDimLength; + mean /= (float32_t)lastDimLength; - sum = 0.0f; + /* Compute variance */ + float32_t var = 0.0f; for (int32_t j = 0; j < lastDimLength; j++) { - temp = local_data_in[j + i * lastDimLength] - mean; - sum += temp * temp; + float32_t diff = row_in[j] - mean; + var += diff * diff; } - sum = sum / (float32_t)lastDimLength; - sum += epsilon; - std = sqrtf(sum); + var /= (float32_t)lastDimLength; + + float32_t isd = 1.0f / sqrtf(var + epsilon); + /* Write stash (indexed by global sequence position) */ + mean_out[start_seq + i] = 
mean; + inv_std_dev_out[start_seq + i] = isd; + + /* Compute normalized output */ for (int32_t j = 0; j < lastDimLength; j++) { - local_data_out[j + i * lastDimLength] = - ((local_data_in[j + i * lastDimLength] - mean) / std) * scale[j] + - bias[j]; + row_out[j] = (row_in[j] - mean) * isd * scale[j] + bias[j]; + } + } +} + +/* + * Backward pass: compute dX for a chunk of sequences. + * + * Uses the pre-computed mean and inv_std_dev stash from the forward pass. + * Called per-core with each core's local chunk. + * + * Math (standard LayerNorm backward with axis=-1): + * x_hat[i] = (x[i] - mean) * isd + * mean_dy = sum(dy) / N + * mean_dy_xhat = sum(dy * x_hat) / N + * dx[i] = gamma[i] * isd * (dy[i] - mean_dy - x_hat[i] * mean_dy_xhat) + */ +void PULP_LayernormGrad_fp32_fp32(const float32_t *dy, const float32_t *x, + const float32_t *mean, const float32_t *inv_std_dev, + float32_t *dx, const float32_t *gamma, + uint32_t elem_count, uint32_t lastDimLength) { + + uint32_t seq_count = elem_count / lastDimLength; + + for (uint32_t s = 0; s < seq_count; s++) { + const float32_t *dy_s = dy + s * lastDimLength; + const float32_t *x_s = x + s * lastDimLength; + float32_t *dx_s = dx + s * lastDimLength; + float32_t m = mean[s]; + float32_t isd = inv_std_dev[s]; + + /* Accumulate sum(gamma*dy) and sum(gamma*dy*x_hat) */ + float32_t sum_gdy = 0.0f; + float32_t sum_gdy_xhat = 0.0f; + for (uint32_t i = 0; i < lastDimLength; i++) { + float32_t x_hat_i = (x_s[i] - m) * isd; + float32_t gdy_i = gamma[i] * dy_s[i]; + sum_gdy += gdy_i; + sum_gdy_xhat += gdy_i * x_hat_i; + } + float32_t mean_gdy = sum_gdy / (float32_t)lastDimLength; + float32_t mean_gdy_xhat = sum_gdy_xhat / (float32_t)lastDimLength; + + /* Compute dX: dx[i] = isd * (gamma[i]*dy[i] - mean(gamma*dy) - x_hat[i]*mean(gamma*dy*x_hat)) */ + for (uint32_t i = 0; i < lastDimLength; i++) { + float32_t x_hat_i = (x_s[i] - m) * isd; + dx_s[i] = isd * (gamma[i] * dy_s[i] - mean_gdy - x_hat_i * mean_gdy_xhat); + } + } +} + +/* + 
* Backward pass: compute dscale (dgamma) and dbias (dbeta) over all sequences. + * + * Called from core 0 only. Uses pre-computed mean and inv_std_dev stash. + * + * Math: + * dgamma[i] = sum_s( dy[s,i] * (x[s,i] - mean[s]) * isd[s] ) + * dbeta[i] = sum_s( dy[s,i] ) + */ +void PULP_LayernormGradParam_fp32_fp32(const float32_t *dy, const float32_t *x, + const float32_t *mean, const float32_t *inv_std_dev, + float32_t *dgamma, float32_t *dbeta, + uint32_t size, uint32_t lastDimLength) { + + uint32_t seq_length = size / lastDimLength; + + /* Initialize output gradients */ + for (uint32_t i = 0; i < lastDimLength; i++) { + dgamma[i] = 0.0f; + dbeta[i] = 0.0f; + } + + for (uint32_t s = 0; s < seq_length; s++) { + const float32_t *dy_s = dy + s * lastDimLength; + const float32_t *x_s = x + s * lastDimLength; + float32_t m = mean[s]; + float32_t isd = inv_std_dev[s]; + + for (uint32_t i = 0; i < lastDimLength; i++) { + float32_t x_hat_i = (x_s[i] - m) * isd; + dgamma[i] += dy_s[i] * x_hat_i; + dbeta[i] += dy_s[i]; } } -} \ No newline at end of file +} diff --git a/TargetLibraries/PULPOpen/src/MaxPool.c b/TargetLibraries/PULPOpen/src/MaxPool.c index 3b630b97cc..590ea6e42a 100644 --- a/TargetLibraries/PULPOpen/src/MaxPool.c +++ b/TargetLibraries/PULPOpen/src/MaxPool.c @@ -60,4 +60,71 @@ void PULP_MaxPool2d_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, } } } +} + +void PULP_MaxPoolGrad2d_fp32_fp32_HWC(const float32_t *__restrict__ pGradOut, + const float32_t *__restrict__ pInput, + uint32_t H_out, uint32_t W_out, uint32_t C, + uint32_t H_in, uint32_t W_in, + uint32_t P, uint32_t Q, uint32_t SP, + uint32_t SQ, float32_t *__restrict__ pGradIn, + uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right) { + + int8_t core_id = pi_core_id(); + int8_t log2Core = LOG2(NUM_CORES); + + uint16_t ch_chunk = (C >> log2Core) + ((C & (NUM_CORES - 1)) != 0); + uint16_t ch_start = MIN(ch_chunk * core_id, C); + uint16_t ch_stop = MIN(ch_start + ch_chunk, C); + + /* 
Zero-initialise the gradient input for our channel slice */ + for (uint32_t h = 0; h < H_in; ++h) { + for (uint32_t w = 0; w < W_in; ++w) { + for (uint32_t c = ch_start; c < ch_stop; ++c) { + pGradIn[(h * W_in + w) * C + c] = 0.0f; + } + } + } + + /* Scatter upstream gradient to the argmax position in each pooling window */ + for (uint32_t h_out = 0; h_out < H_out; ++h_out) { + for (uint32_t w_out = 0; w_out < W_out; ++w_out) { + + int32_t h_in_start = (int32_t)h_out * (int32_t)SP - (int32_t)pad_top; + int32_t w_in_start = (int32_t)w_out * (int32_t)SQ - (int32_t)pad_left; + + for (uint32_t c = ch_start; c < ch_stop; ++c) { + + /* Find the argmax position within the pooling window */ + float32_t max_val = -inf; + int32_t max_h = -1; + int32_t max_w = -1; + + for (uint32_t p = 0; p < P; ++p) { + int32_t h_in = h_in_start + (int32_t)p; + if (h_in < 0 || h_in >= (int32_t)H_in) continue; + + for (uint32_t q = 0; q < Q; ++q) { + int32_t w_in = w_in_start + (int32_t)q; + if (w_in < 0 || w_in >= (int32_t)W_in) continue; + + float32_t val = pInput[((uint32_t)h_in * W_in + (uint32_t)w_in) * C + c]; + if (val > max_val) { + max_val = val; + max_h = h_in; + max_w = w_in; + } + } + } + + /* Accumulate upstream gradient at the argmax position */ + if (max_h >= 0 && max_w >= 0) { + uint32_t out_idx = (h_out * W_out + w_out) * C + c; + uint32_t in_idx = ((uint32_t)max_h * W_in + (uint32_t)max_w) * C + c; + pGradIn[in_idx] += pGradOut[out_idx]; + } + } + } + } } \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/src/Relu.c b/TargetLibraries/PULPOpen/src/Relu.c index 4e309bc092..fa8cf3dbce 100644 --- a/TargetLibraries/PULPOpen/src/Relu.c +++ b/TargetLibraries/PULPOpen/src/Relu.c @@ -23,4 +23,25 @@ void PULP_Relu_fp32_fp32(float32_t *input, float32_t *output, uint32_t size) { for (int32_t i = 0; i < local_size; i++) { local_output[i] = MAX(local_input[i], 0.0f); } +} + +void PULP_ReluGrad_fp32_fp32(float32_t *grad_out, float32_t *data_in, + float32_t *grad_in, uint32_t 
size) { + + int8_t core_id = pi_core_id(); + int8_t log2Core = LOG2(NUM_CORES); + + int32_t chunk = (size >> log2Core) + ((size & (NUM_CORES - 1)) != 0); + int32_t start = MIN(chunk * core_id, size); + int32_t end = MIN(start + chunk, size); + int32_t local_size = end - start; + + float32_t *local_grad_out = grad_out + start; + float32_t *local_data_in = data_in + start; + float32_t *local_grad_in = grad_in + start; + + for (int32_t i = 0; i < local_size; i++) { + // If input > 0, gradient flows through; otherwise gradient is 0 + local_grad_in[i] = (local_data_in[i] > 0.0f) ? local_grad_out[i] : 0.0f; + } } \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/third_party/pulp-trainlib b/TargetLibraries/PULPOpen/third_party/pulp-trainlib new file mode 160000 index 0000000000..37f70e5d3c --- /dev/null +++ b/TargetLibraries/PULPOpen/third_party/pulp-trainlib @@ -0,0 +1 @@ +Subproject commit 37f70e5d3ca1757dff6fed32980e938802a4f20a