From 163e4e41d174cdd9ca589013085a3b940e1bfaf8 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Fri, 23 Jan 2026 17:25:25 +0100 Subject: [PATCH 01/21] wip:SYSTEMDS-3543 --- .../compress/colgroup/ColGroupFactory.java | 182 ++++++++++++++++++ ...ColGroupPiecewiseLinearCompressedTest.java | 70 +++++++ 2 files changed, 252 insertions(+) create mode 100644 src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index c6a098f5c32..778aeb1adb8 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -50,6 +50,7 @@ import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory; import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory; +import org.apache.sysds.runtime.compress.colgroup.scheme.ColGroupPiecewiseLinearCompressed; import org.apache.sysds.runtime.compress.cost.ACostEstimate; import org.apache.sysds.runtime.compress.estim.CompressedSizeInfo; import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup; @@ -936,6 +937,187 @@ private static AColGroup compressLinearFunctional(IColIndex colIndexes, MatrixBl return ColGroupLinearFunctional.create(colIndexes, coefficients, numRows); } + public static AColGroup compressPiecewiseLinearFunctional(IColIndex colIndexes, MatrixBlock in, CompressionSettings cs) { + + + // First, store the contents of one column + + int numRows = in.getNumRows(); + int colIdx = colIndexes.get(0); // the first column + double[] column = getColumn(in, colIdx); + + // Set the target loss + + // Determine the breakpoints: partitioning into segments + + List<Integer> breakpointsList = computeBreakpoints(cs, column); + int[] breakpoints = breakpointsList.stream().mapToInt(Integer::intValue).toArray(); + // For each segment, use linear regression as the compression scheme + + // 3) Per-segment regression -> a,b + int numSeg = breakpoints.length - 1; + double[] slopes = new double[numSeg]; + double[] intercepts = new double[numSeg]; + + for (int s = 0; s < numSeg; s++) { + int start = breakpoints[s]; + int end = breakpoints[s + 1]; + + double[] ab = regressSegment(column, start, end); // uses the same statistics as computeSegmentCost + slopes[s] = ab[0]; + intercepts[s] = ab[1]; + } + // Build the data structure: ColGroupPiecewiseLinearCompressed + + return ColGroupPiecewiseLinearCompressed.create( + colIndexes, + breakpoints, + slopes, + intercepts, + numRows); + } + + + public static double[] getColumn(MatrixBlock in, int colIndex) { + int numRows = in.getNumRows(); // number of rows + double[] column = new double[numRows]; // buffer for the column values + + for (int r = 0; r < numRows; r++) { + column[r] = in.get(r, colIndex); // read the value at (r, colIndex) + } + return column; + } + public static List<Integer> computeBreakpoints(CompressionSettings cs, double[] column){ + int n = column.length; + double targetMSE = cs.getPiecewiseTargetLoss(); // read only, do NOT set here!
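+ + // The helpers below minimize SSE(segmentation) + λ * (number of segments) via dynamic programming, + // i.e. λ acts as a per-segment penalty: λ near 0 permits many segments with minimal error, while a large λ + // forces few segments at a larger error. The binary search below tunes λ so that the total SSE stays within + // the budget sseMax = n * targetMSE, e.g. n = 5 rows at targetMSE = 1e-3 give sseMax = 5e-3.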
+ + // Case A: no target loss given -> simple variant with a fixed λ + if (Double.isNaN(targetMSE) || targetMSE <= 0) { + double lambda = 5.0; + return computeBreakpointsLambda(column, lambda); + } + + // Case B: target loss set -> respect the global error budget + double sseMax = n * targetMSE; // MSE -> SSE budget + + double lambdaMin = 0.0; // many segments, minimal error + double lambdaMax = 1e6; // few segments, larger error + + List<Integer> bestBreaks = null; + + for (int it = 0; it < 20; it++) { // binary search on λ + double lambda = 0.5 * (lambdaMin + lambdaMax); + + List<Integer> breaks = computeBreakpointsLambda(column, lambda); + double totalSSE = computeTotalSSE(column, breaks); + + if (totalSSE <= sseMax) { + // budget met: try a larger λ to get by with even fewer segments + bestBreaks = breaks; + lambdaMin = lambda; + } else { + // error too large: decrease λ and allow more segments + lambdaMax = lambda; + } + } + + if (bestBreaks == null) + bestBreaks = computeBreakpointsLambda(column, lambdaMin); + + return bestBreaks; + } + public static List<Integer> computeBreakpointsLambda(double[] column, double lambda) { + int sizeColumn = column.length; + double[] dp = new double[sizeColumn + 1]; + int[] prev = new int[sizeColumn + 1]; + + dp[0] = 0.0; + + for (int index = 1; index <= sizeColumn; index++) { + dp[index] = Double.POSITIVE_INFINITY; + for (int i = 0; i < index; i++) { // Segment [i, index) + double costCurrentSegment = computeSegmentCost(column, i, index); // SSE + double candidateCost = dp[i] + costCurrentSegment + lambda; + if (candidateCost < dp[index]) { + dp[index] = candidateCost; + prev[index] = i; + } + } + } + + List<Integer> segmentLimits = new ArrayList<>(); + int breakpointIndex = sizeColumn; + while (breakpointIndex > 0) { + segmentLimits.add(breakpointIndex); + breakpointIndex = prev[breakpointIndex]; + } + segmentLimits.add(0); + Collections.sort(segmentLimits); + return segmentLimits; + } + + public static double computeSegmentCost(double[] column, int start, int end) { + int n = end - start; + if (n <= 1) + return 0.0; + + double[] ab = regressSegment(column, start, end); + double slope = ab[0]; + double intercept = ab[1]; + + double sse = 0.0; + for (int i = start; i < end; i++) { + double x = i; + double y = column[i]; + double yhat = slope * x + intercept; + double diff = y - yhat; + sse += diff * diff; + } + return sse; // or sse / n for the MSE + } + public static double computeTotalSSE(double[] column, List<Integer> breaks) { + double total = 0.0; + for (int s = 0; s < breaks.size() - 1; s++) { + int start = breaks.get(s); + int end = breaks.get(s + 1); + total += computeSegmentCost(column, start, end); // SSE of this segment + } + return total; + } + + + public static double[] regressSegment(double[] column, int start, int end) { + int n = end - start; + if (n <= 0) + return new double[] {0.0, 0.0}; + + // ordinary least squares over (x, y) = (row index, value): slope = (n*Σxy - Σx*Σy) / (n*Σx² - (Σx)²), intercept = (Σy - slope*Σx) / n + double sumX = 0, sumY = 0, sumXX = 0, sumXY = 0; + for (int i = start; i < end; i++) { + double x = i; + double y = column[i]; + sumX += x; + sumY += y; + sumXX += x * x; + sumXY += x * y; + } + + double nD = n; + double denom = nD * sumXX - sumX * sumX; + double slope, intercept; + if (denom == 0) { + slope = 0.0; + intercept = sumY / nD; + } + else { + slope = (nD * sumXY - sumX * sumY) / denom; + intercept = (sumY - slope * sumX) / nD; + } + return new double[] {slope, intercept}; + } + + + + private AColGroup compressSDCFromSparseTransposedBlock(IColIndex cols, int nrUniqueEstimate, double tupleSparsity) { if(cols.size() > 1) return
compressMultiColSDCFromSparseTransposedBlock(cols, nrUniqueEstimate, tupleSparsity); diff --git a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java new file mode 100644 index 00000000000..3e13c5756ac --- /dev/null +++ b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java @@ -0,0 +1,70 @@ +package org.apache.sysds.runtime.compress.colgroup; + +import org.apache.sysds.runtime.compress.CompressionSettings; +import org.apache.sysds.runtime.compress.CompressionSettingsBuilder; +import org.apache.sysds.runtime.compress.colgroup.indexes.ArrayIndex; +import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; +import org.apache.sysds.runtime.compress.colgroup.scheme.ColGroupPiecewiseLinearCompressed; +import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory; + +import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; + +import java.util.Arrays; +import java.util.List; + +import static org.apache.sysds.runtime.compress.colgroup.ColGroupFactory.computeBreakpoints; +import static org.junit.Assert.*; + +/** + * Tests for ColGroupPiecewiseLinearCompressed, focused on: + * - constructor / create(...) + * - decompressToDenseBlock(...) + */ +public class ColGroupPiecewiseLinearCompressedTest { + + private CompressionSettings cs; + // ------------------------------------------------------------- + // 1. create(...) and constructor + // ------------------------------------------------------------- + + @BeforeEach + void setUp() { + CompressionSettings cs = new CompressionSettingsBuilder().create(); + + } + + @Test + public void testComputeBreakpoints_uniformColumn() { + cs.setPiecewiseTargetLoss(1e-3); + double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; // ← test-specific + List<Integer> breaks = computeBreakpoints(cs, column); + assertEquals(Arrays.asList(0), breaks); // expected: no breaks + } + + @Test + public void testComputeBreakpoints_linearIncreasing() { + cs.setPiecewiseTargetLoss(1e-3); + double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; // ← different column + List<Integer> breaks = computeBreakpoints(cs, column); + assertEquals(Arrays.asList(0, 2), breaks); // expected + } + + @Test + public void testComputeBreakpoints_highLoss_uniform() { + cs.setPiecewiseTargetLoss(1.0); // ← different loss + double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; + List<Integer> breaks = computeBreakpoints(cs, column); + assertEquals(Arrays.asList(0), breaks); + } + + @Test + public void testComputeBreakpoints_noLoss_linear() { + cs.setPiecewiseTargetLoss(0.0); + double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; + List<Integer> breaks = computeBreakpoints(cs, column); + assertEquals(Arrays.asList(0, 1, 2, 3), breaks); // with 0 loss: breaks everywhere + } + + +} \ No newline at end of file From f5df4eac8135b49e6a4b0f840ed4dd56b9d2f029 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Fri, 23 Jan 2026 22:17:59 +0100 Subject: [PATCH 02/21] My local changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/systemds-standalone.sh | 12 + pom.xml | 8 +- .../runtime/compress/CompressionSettings.java | 29 +- .../compress/CompressionSettingsBuilder.java | 13 +- .../ColGroupPiecewiseLinearCompressed.java | 371 ++++++++++++++++++ .../colgroup/ColGroupFactoryTest.java | 5 + use-java17-systemds.sh | 57 +++ 7 files changed, 488 insertions(+), 7
deletions(-) create mode 100755 bin/systemds-standalone.sh create mode 100644 src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java create mode 100755 use-java17-systemds.sh diff --git a/bin/systemds-standalone.sh b/bin/systemds-standalone.sh new file mode 100755 index 00000000000..9efaa963a4b --- /dev/null +++ b/bin/systemds-standalone.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# Standalone launcher for SystemDS + +SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) +JAR_FILE="$SCRIPT_DIR/../target/systemds-3.4.0-SNAPSHOT.jar" + +if [ ! -f "$JAR_FILE" ]; then + echo "ERROR: standalone JAR not found: $JAR_FILE" + exit 1 +fi + +java -cp "$JAR_FILE" org.apache.sysds.api.DMLScript "$@" diff --git a/pom.xml b/pom.xml index e0b3f794272..c0221cd11d5 100644 --- a/pom.xml +++ b/pom.xml @@ -1548,5 +1548,11 @@ <artifactId>fastdoubleparser</artifactId> <version>0.9.0</version> </dependency> - </dependencies> + <dependency> + <groupId>org.junit.jupiter</groupId> + <artifactId>junit-jupiter</artifactId> + <version>RELEASE</version> + <scope>test</scope> + </dependency> + </dependencies> diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java index f6321bc1b6d..c5d98019947 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java @@ -21,6 +21,7 @@ import java.util.EnumSet; +import com.fasterxml.jackson.annotation.JsonAnySetter; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.sysds.runtime.compress.cocode.CoCoderFactory.PartitionerType; @@ -39,6 +40,22 @@ public class CompressionSettings { /** Parallelization threshold for DDC compression */ public static int PAR_DDC_THRESHOLD = 10000; + /** + * Target overall loss for piecewise linear compression. + * Interpretation: maximum allowed global MSE per value in the column. + * 0.0 ~ virtually lossless, many segments + * >0 ~ more approximation allowed, fewer segments + */ + + private double piecewiseTargetLoss = Double.NaN; + + public void setPiecewiseTargetLoss(double piecewiseTargetLoss) { + this.piecewiseTargetLoss = piecewiseTargetLoss; + } + public double getPiecewiseTargetLoss() { + return piecewiseTargetLoss; + } + /** * Size of the blocks used in a blocked bitmap representation. Note it is exactly Character.MAX_VALUE. This is not * Character max value + 1 because it breaks the offsets in cases with fully dense values.
@@ -133,11 +150,11 @@ public class CompressionSettings { public final double[] scaleFactors; - protected CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary, - String transposeInput, int seed, boolean lossy, EnumSet<CompressionType> validCompressions, - boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage, - int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType, - double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors) { + public CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary, + String transposeInput, int seed, boolean lossy, EnumSet<CompressionType> validCompressions, + boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage, + int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType, + double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors) { this.samplingRatio = samplingRatio; this.samplePower = samplePower; this.allowSharedDictionary = allowSharedDictionary; @@ -181,4 +198,6 @@ public String toString() { sb.append("\t Estimation Type: " + estimationType); return sb.toString(); } + + } diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java index ae6a0b2d231..00375753d6f 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java @@ -54,7 +54,7 @@ public class CompressionSettingsBuilder { private SORT_TYPE sdcSortType = SORT_TYPE.MATERIALIZE; private double[] scaleFactors = null; - public CompressionSettingsBuilder() { + public CompressionSettingsBuilder() { DMLConfig conf = ConfigurationManager.getDMLConfig(); this.lossy = conf.getBooleanValue(DMLConfig.COMPRESSED_LOSSY); @@ -210,6 +210,17 @@ public CompressionSettingsBuilder addValidCompression(CompressionType cp) { return this; } + /** + * Target overall loss for piecewise linear compression. + * Interpretation: maximum allowed global MSE per value in the column. + * 0.0 ~ virtually lossless, many segments + * >0 ~ more approximation allowed, fewer segments + + + public void setPiecewiseTargetLoss(double piecewiseTargetLoss) { + this.piecewiseTargetLoss = piecewiseTargetLoss; + }*/ + /** * Clear all the compression types allowed in the compression. This will only allow the Uncompressed ColGroup type.
* Since this is required for operation of the compression diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java new file mode 100644 index 00000000000..e9e4cd1572b --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java @@ -0,0 +1,371 @@ +package org.apache.sysds.runtime.compress.colgroup.scheme; + +import org.apache.sysds.runtime.compress.colgroup.AColGroup; +import org.apache.sysds.runtime.compress.colgroup.AColGroupCompressed; +import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils; +import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; +import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator; +import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup; +import org.apache.sysds.runtime.data.DenseBlock; +import org.apache.sysds.runtime.data.SparseBlock; +import org.apache.sysds.runtime.data.SparseBlockMCSR; +import org.apache.sysds.runtime.functionobjects.Builtin; +import org.apache.sysds.runtime.instructions.cp.CM_COV_Object; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.matrix.operators.BinaryOperator; +import org.apache.sysds.runtime.matrix.operators.CMOperator; +import org.apache.sysds.runtime.matrix.operators.ScalarOperator; +import org.apache.sysds.runtime.matrix.operators.UnaryOperator; + +import java.util.Arrays; + +public class ColGroupPiecewiseLinearCompressed extends AColGroupCompressed { + + IColIndex colIndexes; + int[] breakpoints; + double[] slopes; + double[] intercepts; + int numRows; + + protected ColGroupPiecewiseLinearCompressed(IColIndex colIndices) { + super(colIndices); + } + + + public ColGroupPiecewiseLinearCompressed(IColIndex colIndexes, int[] breakpoints, double[] slopes, double[] intercepts, int numRows) { + super(colIndexes); + this.breakpoints = breakpoints; + this.slopes = slopes; + this.intercepts = intercepts; + this.numRows = numRows; + } + + + + + + public static AColGroup create(IColIndex colIndexes, int[] breakpoints, double[] slopes, double[] intercepts, int numRows) { + if (breakpoints == null || breakpoints.length < 2) + throw new IllegalArgumentException("Need at least one segment"); + + int numSeg = breakpoints.length - 1; + if (slopes.length != numSeg || intercepts.length != numSeg) + throw new IllegalArgumentException("Inconsistent segment arrays"); + + int[] bpCopy = Arrays.copyOf(breakpoints, breakpoints.length); + double[] slopeCopy = Arrays.copyOf(slopes, slopes.length); + double[] interceptCopy = Arrays.copyOf(intercepts, intercepts.length); + + + return new ColGroupPiecewiseLinearCompressed( + colIndexes, + bpCopy, + slopeCopy, + interceptCopy, + numRows); + + } + + @Override + public void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC) { + final int col = colIndexes.get(0); // with multiple columns: loop + + // Fetch the internal double[] for the target column(s) + // DenseBlock is usually row-major, accessed via db.values(…) + // Simplest variant: work row by row via db.getBlockValues(...).
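+ // Decoding rule (mirroring the per-segment fit in ColGroupFactory.regressSegment): a row r with + // breakpoints[s] <= r < breakpoints[s + 1] is reconstructed as yhat = slopes[s] * r + intercepts[s], + // e.g. breakpoints {0, 3, 5} encode two segments covering rows 0-2 and 3-4.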
+ + final int numSeg = breakpoints.length - 1; + + for (int s = 0; s < numSeg; s++) { + final int segStart = breakpoints[s]; + final int segEnd = breakpoints[s + 1]; + final double a = slopes[s]; + final double b = intercepts[s]; + + // restrict the segment to the requested row range + final int rs = Math.max(segStart, rl); + final int re = Math.min(segEnd, ru); + if (rs >= re) + continue; + + for (int r = rs; r < re; r++) { + double x = r; // the same x as used for the fit + double yhat = a * x + b; + + // global position in the DenseBlock + int gr = r + offR; + int gc = col + offC; + + // db.set(row, col, value) + db.set(gr, gc, yhat); + } + } + } + @Override + protected double computeMxx(double c, Builtin builtin) { + return 0; + } + + @Override + protected void computeColMxx(double[] c, Builtin builtin) { + + } + + @Override + protected void computeSum(double[] c, int nRows) { + + } + + @Override + protected void computeSumSq(double[] c, int nRows) { + + } + + @Override + protected void computeColSumsSq(double[] c, int nRows) { + + } + + @Override + protected void computeRowSums(double[] c, int rl, int ru, double[] preAgg) { + + } + + @Override + protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru, double[] preAgg) { + + } + + @Override + protected void computeProduct(double[] c, int nRows) { + + } + + @Override + protected void computeRowProduct(double[] c, int rl, int ru, double[] preAgg) { + + } + + @Override + protected void computeColProduct(double[] c, int nRows) { + + } + + @Override + protected double[] preAggSumRows() { + return new double[0]; + } + + @Override + protected double[] preAggSumSqRows() { + return new double[0]; + } + + @Override + protected double[] preAggProductRows() { + return new double[0]; + } + + @Override + protected double[] preAggBuiltinRows(Builtin builtin) { + return new double[0]; + } + + @Override + public boolean sameIndexStructure(AColGroupCompressed that) { + return false; + } + + @Override + protected void tsmm(double[] result, int numColumns, int nRows) { + + } + + @Override + public AColGroup copyAndSet(IColIndex colIndexes) { + return null; + } + + @Override + public void decompressToDenseBlockTransposed(DenseBlock db, int rl, int ru) { + + } + + @Override + public void decompressToSparseBlockTransposed(SparseBlockMCSR sb, int nColOut) { + + } + + @Override + public double getIdx(int r, int colIdx) { + return 0; + } + + @Override + public int getNumValues() { + return 0; + } + + @Override + public CompressionType getCompType() { + return null; + } + + @Override + protected ColGroupType getColGroupType() { + return null; + } + + + + @Override + public void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC) { + + } + + @Override + public AColGroup rightMultByMatrix(MatrixBlock right, IColIndex allCols, int k) { + return null; + } + + @Override + public void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock result, int rl, int ru, int cl, int cu) { + + } + + @Override + public void leftMultByAColGroup(AColGroup lhs, MatrixBlock result, int nRows) { + + } + + @Override + public void tsmmAColGroup(AColGroup other, MatrixBlock result) { + + } + + @Override + public AColGroup scalarOperation(ScalarOperator op) { + return null; + } + + @Override + public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) { + return null; + } + + @Override + public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) { + return null; + } + + @Override + protected AColGroup
sliceSingleColumn(int idx) { + return null; + } + + @Override + protected AColGroup sliceMultiColumns(int idStart, int idEnd, IColIndex outputCols) { + return null; + } + + @Override + public AColGroup sliceRows(int rl, int ru) { + return null; + } + + @Override + public boolean containsValue(double pattern) { + return false; + } + + @Override + public long getNumberNonZeros(int nRows) { + return 0; + } + + @Override + public AColGroup replace(double pattern, double replace) { + return null; + } + + @Override + public void computeColSums(double[] c, int nRows) { + + } + + @Override + public CM_COV_Object centralMoment(CMOperator op, int nRows) { + return null; + } + + @Override + public AColGroup rexpandCols(int max, boolean ignore, boolean cast, int nRows) { + return null; + } + + @Override + public double getCost(ComputationCostEstimator e, int nRows) { + return 0; + } + + @Override + public AColGroup unaryOperation(UnaryOperator op) { + return null; + } + + @Override + public AColGroup append(AColGroup g) { + return null; + } + + @Override + protected AColGroup appendNInternal(AColGroup[] groups, int blen, int rlen) { + return null; + } + + @Override + public ICLAScheme getCompressionScheme() { + return null; + } + + @Override + public AColGroup recompress() { + return null; + } + + @Override + public CompressedSizeInfoColGroup getCompressionInfo(int nRow) { + return null; + } + + @Override + protected AColGroup fixColIndexes(IColIndex newColIndex, int[] reordering) { + return null; + } + + @Override + public AColGroup reduceCols() { + return null; + } + + @Override + public double getSparsity() { + return 0; + } + + @Override + protected void sparseSelection(MatrixBlock selection, ColGroupUtils.P[] points, MatrixBlock ret, int rl, int ru) { + + } + + @Override + protected void denseSelection(MatrixBlock selection, ColGroupUtils.P[] points, MatrixBlock ret, int rl, int ru) { + + } + + @Override + public AColGroup[] splitReshape(int multiplier, int nRow, int nColOrg) { + return new AColGroup[0]; + } +} + diff --git a/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupFactoryTest.java b/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupFactoryTest.java index 0468de4dc04..597e065aab6 100644 --- a/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupFactoryTest.java +++ b/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupFactoryTest.java @@ -19,8 +19,10 @@ package org.apache.sysds.test.component.compress.colgroup; +import static org.apache.sysds.runtime.compress.colgroup.ColGroupFactory.computeSegmentCost; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.ArrayList; import java.util.Collection; @@ -51,6 +53,7 @@ @RunWith(value = Parameterized.class) public class ColGroupFactoryTest { + private final MatrixBlock mb; private final MatrixBlock mbt; private final ACostEstimate ce; @@ -328,4 +331,6 @@ public int numBlocks() { return 2; } } + + } diff --git a/use-java17-systemds.sh b/use-java17-systemds.sh new file mode 100755 index 00000000000..0c1a2fda871 --- /dev/null +++ b/use-java17-systemds.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# ------------------------------------------------------------------ +# SystemDS macOS build script +# Sets JAVA_HOME and PATH, runs Maven, and generates systemds-standalone.sh +# ------------------------------------------------------------------ + +# 1️⃣ Set Java 17 +export
JAVA_HOME=$(/usr/libexec/java_home -v 17) +export PATH="$JAVA_HOME/bin:/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/bin:/opt/homebrew/bin:/opt/homebrew/sbin:$PATH" + +# 2️⃣ Optional: Python, ghcup, uix/Deno, Coursier, JetBrains Toolbox +export PATH="/Library/Frameworks/Python.framework/Versions/3.11/bin:$HOME/.ghcup/bin:$HOME/.uix/bin:$PATH" +export DENO_INSTALL="$HOME/.uix" +export PATH="$DENO_INSTALL/bin:$PATH" +export PATH="$PATH:/Users/mori/Library/Application Support/Coursier/bin" +export PATH="$PATH:/Users/mori/Library/Application Support/JetBrains/Toolbox/scripts" + +# 3️⃣ Check whether Maven exists +if ! command -v mvn >/dev/null 2>&1; then + echo "ERROR: Maven (mvn) not found. Please install it!" + exit 1 +fi + +# 4️⃣ Check that we are in the project root (pom.xml present) +if [ ! -f "pom.xml" ]; then + echo "ERROR: pom.xml not found. Please change into the SystemDS project root." + exit 1 +fi + +# 5️⃣ Run the Maven build +echo "📦 Starting Maven build..." +mvn clean package -DskipTests + +# 6️⃣ Generate the standalone script +echo "🔧 Generating bin/systemds-standalone.sh..." + +mkdir -p bin +cat > bin/systemds-standalone.sh << 'EOF' +#!/bin/bash +# Standalone launcher for SystemDS + +SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) +JAR_FILE="$SCRIPT_DIR/../target/systemds-3.4.0-SNAPSHOT.jar" + +if [ ! -f "$JAR_FILE" ]; then + echo "ERROR: standalone JAR not found: $JAR_FILE" + exit 1 +fi + +java -cp "$JAR_FILE" org.apache.sysds.api.DMLScript "$@" +EOF + +# 7️⃣ Make it executable +chmod +x bin/systemds-standalone.sh + +echo "✅ Done! Standalone script created: bin/systemds-standalone.sh" + From 8f5c844e5ef074a4ad2d8e267bf146604a3c0bde Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Fri, 23 Jan 2026 22:41:46 +0100 Subject: [PATCH 03/21] wip: test --- .../colgroup/ColGroupPiecewiseLinearCompressedTest.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java index 3e13c5756ac..5a740624d2d 100644 --- a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java +++ b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java @@ -21,6 +21,7 @@ * - constructor / create(...) * - decompressToDenseBlock(...)
*/ +//TODO Fix public class ColGroupPiecewiseLinearCompressedTest { private CompressionSettings cs; @@ -48,6 +49,7 @@ public void testComputeBreakpoints_linearIncreasing() { double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; // ← different column List<Integer> breaks = computeBreakpoints(cs, column); assertEquals(Arrays.asList(0, 2), breaks); // expected + } @Test From 11415fa06e934a87b7617501b921d40563784098 Mon Sep 17 00:00:00 2001 From: Jannik Lindemann Date: Mon, 26 Jan 2026 10:35:00 +0100 Subject: [PATCH 04/21] Test Fix --- .../runtime/compress/CompressionSettings.java | 8 -------- .../ColGroupPiecewiseLinearCompressed.java | 12 ++++++------ .../ColGroupPiecewiseLinearCompressedTest.java | 18 +++++------------- 3 files changed, 11 insertions(+), 27 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java index 71c0a7e4d34..b853fd7f3ef 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java @@ -150,13 +150,6 @@ public double getPiecewiseTargetLoss() { public final double[] scaleFactors; -<<<<<<< HEAD - public CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary, - String transposeInput, int seed, boolean lossy, EnumSet<CompressionType> validCompressions, - boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage, - int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType, - double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors) { -======= public final boolean preferDeltaEncoding; protected CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary, String transposeInput, int seed, boolean lossy, EnumSet<CompressionType> validCompressions, boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage, int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType, double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors, boolean preferDeltaEncoding) { ->>>>>>> upstream/main this.samplingRatio = samplingRatio; this.samplePower = samplePower; this.allowSharedDictionary = allowSharedDictionary; diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java index e9e4cd1572b..ec63d3bfb4e 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java @@ -10,7 +10,7 @@ import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.data.SparseBlockMCSR; import org.apache.sysds.runtime.functionobjects.Builtin; -import org.apache.sysds.runtime.instructions.cp.CM_COV_Object; +import org.apache.sysds.runtime.instructions.cp.CmCovObject; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.matrix.operators.BinaryOperator; import org.apache.sysds.runtime.matrix.operators.CMOperator; @@ -293,12 +293,12 @@ public void computeColSums(double[] c, int nRows) { } - @Override - public CM_COV_Object centralMoment(CMOperator op, int nRows) { - return null; - } + @Override + public CmCovObject
centralMoment(CMOperator op, int nRows) { + return null; + } - @Override + @Override public AColGroup rexpandCols(int max, boolean ignore, boolean cast, int nRows) { return null; } diff --git a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java index 5a740624d2d..3335fbe5e7c 100644 --- a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java +++ b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java @@ -23,20 +23,9 @@ */ //TODO Fix public class ColGroupPiecewiseLinearCompressedTest { - - private CompressionSettings cs; - // ------------------------------------------------------------- - // 1. create(...) and constructor - // ------------------------------------------------------------- - - @BeforeEach - void setUp() { - CompressionSettings cs = new CompressionSettingsBuilder().create(); - - } - @Test public void testComputeBreakpoints_uniformColumn() { + CompressionSettings cs = new CompressionSettingsBuilder().create(); cs.setPiecewiseTargetLoss(1e-3); double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; // ← test-specific List<Integer> breaks = computeBreakpoints(cs, column); @@ -45,6 +34,7 @@ public void testComputeBreakpoints_uniformColumn() { @Test public void testComputeBreakpoints_linearIncreasing() { + CompressionSettings cs = new CompressionSettingsBuilder().create(); cs.setPiecewiseTargetLoss(1e-3); double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; // ← different column List<Integer> breaks = computeBreakpoints(cs, column); @@ -54,6 +44,7 @@ public void testComputeBreakpoints_linearIncreasing() { @Test public void testComputeBreakpoints_highLoss_uniform() { + CompressionSettings cs = new CompressionSettingsBuilder().create(); cs.setPiecewiseTargetLoss(1.0); // ← different loss double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; List<Integer> breaks = computeBreakpoints(cs, column); @@ -62,6 +53,7 @@ public void testComputeBreakpoints_highLoss_uniform() { @Test public void testComputeBreakpoints_noLoss_linear() { + CompressionSettings cs = new CompressionSettingsBuilder().create(); cs.setPiecewiseTargetLoss(0.0); double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; List<Integer> breaks = computeBreakpoints(cs, column); @@ -69,4 +61,4 @@ public void testComputeBreakpoints_noLoss_linear() { } -} \ No newline at end of file +} From 5301f8fb61e152bd1e31b7c91b9851d2350cd9ab Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Mon, 26 Jan 2026 20:19:12 +0100 Subject: [PATCH 05/21] wip: test --- .../runtime/compress/CompressionSettings.java | 324 +++++---- .../compress/CompressionSettingsBuilder.java | 613 +++++++++--------- .../compress/colgroup/ColGroupFactory.java | 8 +- .../ColGroupPiecewiseLinearCompressed.java | 5 +- ...ColGroupPiecewiseLinearCompressedTest.java | 380 ++++++++++- 5 files changed, 833 insertions(+), 497 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java index 71c0a7e4d34..d1f97928975 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java @@ -21,7 +21,6 @@ import java.util.EnumSet; -import com.fasterxml.jackson.annotation.JsonAnySetter; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import
org.apache.sysds.runtime.compress.cocode.CoCoderFactory.PartitionerType; @@ -35,181 +34,174 @@ * CompressionSettingsBuilder for default non static parameters. */ public class CompressionSettings { - private static final Log LOG = LogFactory.getLog(CompressionSettings.class.getName()); + private static final Log LOG = LogFactory.getLog(CompressionSettings.class.getName()); - /** Parallelization threshold for DDC compression */ - public static int PAR_DDC_THRESHOLD = 10000; + /** Parallelization threshold for DDC compression */ + public static int PAR_DDC_THRESHOLD = 10000; /** - * Target overall loss for piecewise linear compression. - * Interpretation: maximum allowed global MSE per value in the column. - * 0.0 ~ virtually lossless, many segments - * >0 ~ more approximation allowed, fewer segments + * Size of the blocks used in a blocked bitmap representation. Note it is exactly Character.MAX_VALUE. This is not + * Character max value + 1 because it breaks the offsets in cases with fully dense values. */ + public static final int BITMAP_BLOCK_SZ = Character.MAX_VALUE; - private double piecewiseTargetLoss = Double.NaN; + /** + * Sorting of values by physical length helps by 10-20%, especially for serial, while slight performance decrease + * for parallel incl multi-threaded, hence not applied for distributed operations (also because compression time + + * garbage collection increases) + */ + public final boolean sortTuplesByFrequency; + + /** + * The sampling ratio used when choosing ColGroups. Note that, default behavior is to use exact estimator if the + * number of elements is below 1000. + * + * DEPRECATED + */ + public final double samplingRatio; + + /** + * The sampling ratio power to use when choosing sample size. This is used in accordance to the function: + * + * sampleSize += nRows^samplePower; + * + * The value is bounded to be in the range of 0 to 1, 1 giving a sample size of everything, and 0 adding 1. + */ + public final double samplePower; + + /** Share DDC Dictionaries between ColGroups. */ + public final boolean allowSharedDictionary; + + /** Boolean specifying which transpose setting is used, can be auto, true or false */ + public final String transposeInput; + + /** If the seed is -1 then the system used system millisecond time and class hash for seeding. */ + public final int seed; + + /** True if lossy compression is enabled */ + public final boolean lossy; + + /** The selected method for column partitioning used in CoCoding compressed columns */ + public final PartitionerType columnPartitioner; + + /** The cost computation type for the compression */ + public final CostType costComputationType; + + /** The maximum number of columns CoCoded allowed */ + public final int maxColGroupCoCode; + + /** + * A Cocode parameter that differ in behavior based on compression method, in general it is a value that reflects + * aggressively likely coCoding is used. + */ + public final double coCodePercentage; + + /** + * Valid Compressions List, containing the ColGroup CompressionTypes that are allowed to be used for the compression + * Default is to always allow for Uncompromisable ColGroup. + */ + public final EnumSet<CompressionType> validCompressions; + + /** The minimum size of the sample extracted. */ + public final int minimumSampleSize; + /** The maximum size of the sample extracted. */ + public final int maxSampleSize; + + /** The sample type used for sampling */ + public final EstimationType estimationType; + + /** + * Transpose input matrix, to optimize access when extracting bitmaps.
This setting is changed inside the script + * based on the transposeInput setting. + * + * This is intentionally left as a mutable value, since the transposition of the input matrix is decided in phase 3. + */ + public boolean transposed = false; + + /** The minimum compression ratio to achieve. */ + public final double minimumCompressionRatio; + + + + /** Is a spark instruction */ + public final boolean isInSparkInstruction; + + /** The sorting type used in sorting/joining offsets to create SDC groups */ + public final SORT_TYPE sdcSortType; + + /** if the settings have been logged already. */ + public static boolean printedStatus = false; + + public final double[] scaleFactors; + + public final boolean preferDeltaEncoding; + + /** + * Target overall loss for piecewise linear compression. + * Interpretation: maximum allowed global MSE per value in the column. + * 0.0 ~ virtually lossless, many segments + * >0 ~ more approximation allowed, fewer segments + */ + private double piecewiseTargetLoss = Double.NaN; public void setPiecewiseTargetLoss(double piecewiseTargetLoss) { this.piecewiseTargetLoss = piecewiseTargetLoss; + } public double getPiecewiseTargetLoss() { return piecewiseTargetLoss; } - /** - * Size of the blocks used in a blocked bitmap representation. Note it is exactly Character.MAX_VALUE. This is not - * Character max value + 1 because it breaks the offsets in cases with fully dense values. - */ - public static final int BITMAP_BLOCK_SZ = Character.MAX_VALUE; - - /** - * Sorting of values by physical length helps by 10-20%, especially for serial, while slight performance decrease - * for parallel incl multi-threaded, hence not applied for distributed operations (also because compression time + - * garbage collection increases) - */ - public final boolean sortTuplesByFrequency; - - /** - * The sampling ratio used when choosing ColGroups. Note that, default behavior is to use exact estimator if the - * number of elements is below 1000. - * - * DEPRECATED - */ - public final double samplingRatio; - - /** - * The sampling ratio power to use when choosing sample size. This is used in accordance to the function: - * - * sampleSize += nRows^samplePower; - * - * The value is bounded to be in the range of 0 to 1, 1 giving a sample size of everything, and 0 adding 1. - */ - public final double samplePower; - - /** Share DDC Dictionaries between ColGroups. */ - public final boolean allowSharedDictionary; - - /** Boolean specifying which transpose setting is used, can be auto, true or false */ - public final String transposeInput; - - /** If the seed is -1 then the system used system millisecond time and class hash for seeding. */ - public final int seed; - - /** True if lossy compression is enabled */ - public final boolean lossy; - - /** The selected method for column partitioning used in CoCoding compressed columns */ - public final PartitionerType columnPartitioner; - - /** The cost computation type for the compression */ - public final CostType costComputationType; - - /** The maximum number of columns CoCoded allowed */ - public final int maxColGroupCoCode; - - /** - * A Cocode parameter that differ in behavior based on compression method, in general it is a value that reflects - * aggressively likely coCoding is used. - */ - public final double coCodePercentage; - - /** - * Valid Compressions List, containing the ColGroup CompressionTypes that are allowed to be used for the compression - * Default is to always allow for Uncompromisable ColGroup.
- */ - public final EnumSet<CompressionType> validCompressions; - - /** The minimum size of the sample extracted. */ - public final int minimumSampleSize; - - /** The maximum size of the sample extracted. */ - public final int maxSampleSize; - - /** The sample type used for sampling */ - public final EstimationType estimationType; - - /** - * Transpose input matrix, to optimize access when extracting bitmaps. This setting is changed inside the script - * based on the transposeInput setting. - * - * This is intentionally left as a mutable value, since the transposition of the input matrix is decided in phase 3. - */ - public boolean transposed = false; - - /** The minimum compression ratio to achieve. */ - public final double minimumCompressionRatio; - - /** Is a spark instruction */ - public final boolean isInSparkInstruction; - - /** The sorting type used in sorting/joining offsets to create SDC groups */ - public final SORT_TYPE sdcSortType; - - /** if the settings have been logged already. */ - public static boolean printedStatus = false; - - public final double[] scaleFactors; - -<<<<<<< HEAD - public CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary, - String transposeInput, int seed, boolean lossy, EnumSet<CompressionType> validCompressions, - boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage, - int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType, - double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors) { -======= - public final boolean preferDeltaEncoding; - - protected CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary, - String transposeInput, int seed, boolean lossy, EnumSet<CompressionType> validCompressions, - boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage, - int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType, - double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors, - boolean preferDeltaEncoding) { ->>>>>>> upstream/main - this.samplingRatio = samplingRatio; - this.samplePower = samplePower; - this.allowSharedDictionary = allowSharedDictionary; - this.transposeInput = transposeInput; - this.seed = seed == -1 ?
(int) System.nanoTime() : seed; - this.validCompressions = validCompressions; - this.lossy = lossy; - this.sortTuplesByFrequency = sortValuesByLength; - this.columnPartitioner = columnPartitioner; - this.maxColGroupCoCode = maxColGroupCoCode; - this.coCodePercentage = coCodePercentage; - this.minimumSampleSize = minimumSampleSize; - this.maxSampleSize = maxSampleSize; - this.estimationType = estimationType; - this.costComputationType = costComputationType; - this.minimumCompressionRatio = minimumCompressionRatio; - this.isInSparkInstruction = isInSparkInstruction; - this.sdcSortType = sdcSortType; - this.scaleFactors = scaleFactors; - this.preferDeltaEncoding = preferDeltaEncoding; - - if(!printedStatus && LOG.isDebugEnabled()) { - printedStatus = true; - LOG.debug(this.toString()); - } - } - - public boolean isRLEAllowed() { - return this.validCompressions.contains(CompressionType.RLE); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("CompressionSettings: "); - sb.append("\t Valid Compressions: " + validCompressions); - sb.append("\t Share dict: " + allowSharedDictionary); - sb.append("\t Partitioner: " + columnPartitioner); - sb.append("\t Lossy: " + lossy); - sb.append("\t Cost Computation Type: " + costComputationType); - if(samplingRatio < 1.0) - sb.append("\t Estimation Type: " + estimationType); - return sb.toString(); - } - - -} + + protected CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary, + String transposeInput, int seed, boolean lossy, EnumSet<CompressionType> validCompressions, + boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage, + int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType, + double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors, + boolean preferDeltaEncoding) { + this.samplingRatio = samplingRatio; + this.samplePower = samplePower; + this.allowSharedDictionary = allowSharedDictionary; + this.transposeInput = transposeInput; + this.seed = seed == -1 ?
(int) System.nanoTime() : seed; + this.validCompressions = validCompressions; + this.lossy = lossy; + this.sortTuplesByFrequency = sortValuesByLength; + this.columnPartitioner = columnPartitioner; + this.maxColGroupCoCode = maxColGroupCoCode; + this.coCodePercentage = coCodePercentage; + this.minimumSampleSize = minimumSampleSize; + this.maxSampleSize = maxSampleSize; + this.estimationType = estimationType; + this.costComputationType = costComputationType; + this.minimumCompressionRatio = minimumCompressionRatio; + this.isInSparkInstruction = isInSparkInstruction; + this.sdcSortType = sdcSortType; + this.scaleFactors = scaleFactors; + this.preferDeltaEncoding = preferDeltaEncoding; + + if(!printedStatus && LOG.isDebugEnabled()) { + printedStatus = true; + LOG.debug(this.toString()); + } + } + + public boolean isRLEAllowed() { + return this.validCompressions.contains(CompressionType.RLE); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("CompressionSettings: "); + sb.append("\t Valid Compressions: " + validCompressions); + sb.append("\t Share dict: " + allowSharedDictionary); + sb.append("\t Partitioner: " + columnPartitioner); + sb.append("\t Lossy: " + lossy); + sb.append("\t Cost Computation Type: " + costComputationType); + if(samplingRatio < 1.0) + sb.append("\t Estimation Type: " + estimationType); + return sb.toString(); + } +} \ No newline at end of file diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java index df618f44f20..9af1b5aff2e 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java @@ -34,343 +34,332 @@ * Builder pattern for Compression Settings. See CompressionSettings for details on values. 
*/ public class CompressionSettingsBuilder { - private double samplingRatio; - private double samplePower = 0.65; - private boolean allowSharedDictionary = false; - private String transposeInput; - private int seed = -1; - private boolean lossy = false; - private EnumSet<CompressionType> validCompressions; - private boolean sortValuesByLength = true; - private int maxColGroupCoCode = 10000; - private double coCodePercentage = 0.01; - private int minimumSampleSize = 3000; - private int maxSampleSize = 1000000; - private EstimationType estimationType = EstimationType.HassAndStokes; - private PartitionerType columnPartitioner; - private CostType costType; - private double minimumCompressionRatio = 1.0; - private boolean isInSparkInstruction = false; - private SORT_TYPE sdcSortType = SORT_TYPE.MATERIALIZE; - private double[] scaleFactors = null; - private boolean preferDeltaEncoding = false; + private double samplingRatio; + private double samplePower = 0.65; + private boolean allowSharedDictionary = false; + private String transposeInput; + private int seed = -1; + private boolean lossy = false; + private EnumSet<CompressionType> validCompressions; + private boolean sortValuesByLength = true; + private int maxColGroupCoCode = 10000; + private double coCodePercentage = 0.01; + private int minimumSampleSize = 3000; + private int maxSampleSize = 1000000; + private EstimationType estimationType = EstimationType.HassAndStokes; + private PartitionerType columnPartitioner; + private CostType costType; + private double minimumCompressionRatio = 1.0; + private boolean isInSparkInstruction = false; + private SORT_TYPE sdcSortType = SORT_TYPE.MATERIALIZE; + private double[] scaleFactors = null; + private boolean preferDeltaEncoding = false; public CompressionSettingsBuilder() { - DMLConfig conf = ConfigurationManager.getDMLConfig(); - this.lossy = conf.getBooleanValue(DMLConfig.COMPRESSED_LOSSY); - this.validCompressions = EnumSet.of(CompressionType.UNCOMPRESSED, CompressionType.CONST, CompressionType.EMPTY); - String[] validCompressionsString = conf.getTextValue(DMLConfig.COMPRESSED_VALID_COMPRESSIONS).split(","); - for(String comp : validCompressionsString) - validCompressions.add(CompressionType.valueOf(comp)); - samplingRatio = conf.getDoubleValue(DMLConfig.COMPRESSED_SAMPLING_RATIO); - columnPartitioner = PartitionerType.valueOf(conf.getTextValue(DMLConfig.COMPRESSED_COCODE)); - costType = CostType.valueOf(conf.getTextValue(DMLConfig.COMPRESSED_COST_MODEL)); - transposeInput = conf.getTextValue(DMLConfig.COMPRESSED_TRANSPOSE); - seed = DMLScript.SEED; + DMLConfig conf = ConfigurationManager.getDMLConfig(); + this.lossy = conf.getBooleanValue(DMLConfig.COMPRESSED_LOSSY); + this.validCompressions = EnumSet.of(CompressionType.UNCOMPRESSED, CompressionType.CONST, CompressionType.EMPTY); + String[] validCompressionsString = conf.getTextValue(DMLConfig.COMPRESSED_VALID_COMPRESSIONS).split(","); + for(String comp : validCompressionsString) + validCompressions.add(CompressionType.valueOf(comp)); + samplingRatio = conf.getDoubleValue(DMLConfig.COMPRESSED_SAMPLING_RATIO); + columnPartitioner = PartitionerType.valueOf(conf.getTextValue(DMLConfig.COMPRESSED_COCODE)); + costType = CostType.valueOf(conf.getTextValue(DMLConfig.COMPRESSED_COST_MODEL)); + transposeInput = conf.getTextValue(DMLConfig.COMPRESSED_TRANSPOSE); + seed = DMLScript.SEED; - } + } - /** - * Sets the scale factors for compression, enabling quantization-fused compression. - * - * @param scaleFactors An array of scale factors applied during compression.
- * - If row-wise scaling is used, this should be an array where each value corresponds to a row. - * - If a single scalar is provided, it is applied uniformly to the entire matrix. - * @return The CompressionSettingsBuilder instance with the updated scale factors. - */ - public CompressionSettingsBuilder setScaleFactor(double[] scaleFactors) { - this.scaleFactors = scaleFactors; - return this; - } - - /** - * Copy the settings from another CompressionSettings Builder, modifies this, not that. - * - * @param that The other CompressionSettingsBuilder to copy settings from. - * @return The modified CompressionSettings in the same object. - */ - public CompressionSettingsBuilder copySettings(CompressionSettings that) { - this.samplingRatio = that.samplingRatio; - this.allowSharedDictionary = that.allowSharedDictionary; - this.transposeInput = that.transposeInput; - this.seed = that.seed; - this.lossy = that.lossy; - this.validCompressions = EnumSet.copyOf(that.validCompressions); - this.sortValuesByLength = that.sortTuplesByFrequency; - this.columnPartitioner = that.columnPartitioner; - this.maxColGroupCoCode = that.maxColGroupCoCode; - this.coCodePercentage = that.coCodePercentage; - this.minimumSampleSize = that.minimumSampleSize; - this.preferDeltaEncoding = that.preferDeltaEncoding; - return this; - } - - /** - * Set the Compression to use Lossy compression. - * - * @param lossy A boolean specifying if the compression should be lossy - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setLossy(boolean lossy) { - this.lossy = lossy; - return this; - } - - /** - * Set the sampling ratio in percent to sample the input matrix. Input value should be in range 0.0 - 1.0 - * - * @param samplingRatio The ratio to sample from the input - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setSamplingRatio(double samplingRatio) { - this.samplingRatio = samplingRatio; - return this; - } + /** + * Sets the scale factors for compression, enabling quantization-fused compression. + * + * @param scaleFactors An array of scale factors applied during compression. + * - If row-wise scaling is used, this should be an array where each value corresponds to a row. + * - If a single scalar is provided, it is applied uniformly to the entire matrix. + * @return The CompressionSettingsBuilder instance with the updated scale factors. + */ + public CompressionSettingsBuilder setScaleFactor(double[] scaleFactors) { + this.scaleFactors = scaleFactors; + return this; + } - /** - * Set the sortValuesByLength flag. This sorts the dictionaries containing the data based on their occurences in the - * ColGroup. Improving cache efficiency especially for diverse column groups. - * - * @param sortValuesByLength A boolean specifying if the values should be sorted - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setSortValuesByLength(boolean sortValuesByLength) { - this.sortValuesByLength = sortValuesByLength; - return this; - } + /** + * Copy the settings from another CompressionSettings Builder, modifies this, not that. + * + * @param that The other CompressionSettingsBuilder to copy settings from. + * @return The modified CompressionSettings in the same object. 
+ */ + public CompressionSettingsBuilder copySettings(CompressionSettings that) { + this.samplingRatio = that.samplingRatio; + this.allowSharedDictionary = that.allowSharedDictionary; + this.transposeInput = that.transposeInput; + this.seed = that.seed; + this.lossy = that.lossy; + this.validCompressions = EnumSet.copyOf(that.validCompressions); + this.sortValuesByLength = that.sortTuplesByFrequency; + this.columnPartitioner = that.columnPartitioner; + this.maxColGroupCoCode = that.maxColGroupCoCode; + this.coCodePercentage = that.coCodePercentage; + this.minimumSampleSize = that.minimumSampleSize; + this.preferDeltaEncoding = that.preferDeltaEncoding; + return this; + } /** - * Set the Compression to use Lossy compression. - * - * @param lossy A boolean specifying if the compression should be lossy - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setLossy(boolean lossy) { - this.lossy = lossy; - return this; - } + /** + * Set the Compression to use Lossy compression. + * + * @param lossy A boolean specifying if the compression should be lossy + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setLossy(boolean lossy) { + this.lossy = lossy; + return this; + } /** - * Set the sampling ratio in percent to sample the input matrix. Input value should be in range 0.0 - 1.0 - * - * @param samplingRatio The ratio to sample from the input - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setSamplingRatio(double samplingRatio) { - this.samplingRatio = samplingRatio; - return this; - } + /** + * Set the sampling ratio in percent to sample the input matrix. Input value should be in range 0.0 - 1.0 + * + * @param samplingRatio The ratio to sample from the input + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setSamplingRatio(double samplingRatio) { + this.samplingRatio = samplingRatio; + return this; + } /** - * Set the sortValuesByLength flag. This sorts the dictionaries containing the data based on their occurences in the - * ColGroup. Improving cache efficiency especially for diverse column groups. - * - * @param sortValuesByLength A boolean specifying if the values should be sorted - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setSortValuesByLength(boolean sortValuesByLength) { - this.sortValuesByLength = sortValuesByLength; - return this; - } + /** + * Set the sortValuesByLength flag. This sorts the dictionaries containing the data based on their occurences in the + * ColGroup. Improving cache efficiency especially for diverse column groups. + * + * @param sortValuesByLength A boolean specifying if the values should be sorted + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setSortValuesByLength(boolean sortValuesByLength) { + this.sortValuesByLength = sortValuesByLength; + return this; + } /** - * Allow the Dictionaries to be shared between different column groups. - * - * @param allowSharedDictionary A boolean specifying if the dictionary can be shared between column groups. - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setAllowSharedDictionary(boolean allowSharedDictionary) { - this.allowSharedDictionary = allowSharedDictionary; - return this; - } + /** + * Allow the Dictionaries to be shared between different column groups. + * + * @param allowSharedDictionary A boolean specifying if the dictionary can be shared between column groups. + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setAllowSharedDictionary(boolean allowSharedDictionary) { + this.allowSharedDictionary = allowSharedDictionary; + return this; + } /** - * Specify if the input matrix should be transposed before compression. This improves cache efficiency while - * compression the input matrix - * - * @param transposeInput string specifying if the input should be transposed before compression, should be one of - * "auto", "true" or "false" - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setTransposeInput(String transposeInput) { - switch(transposeInput) { - case "auto": - case "true": - case "false": - this.transposeInput = transposeInput; - break; - default: - throw new DMLCompressionException("Invalid transpose technique"); - } - return this; - } + /** + * Specify if the input matrix should be transposed before compression. This improves cache efficiency while + * compression the input matrix + * + * @param transposeInput string specifying if the input should be transposed before compression, should be one of + * "auto", "true" or "false" + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setTransposeInput(String transposeInput) { + switch(transposeInput) { + case "auto": + case "true": + case "false": + this.transposeInput = transposeInput; + break; + default: + throw new DMLCompressionException("Invalid transpose technique"); + } + return this; + } /** - * Set the seed for the compression operation. - * - * @param seed The seed used in sampling the matrix and general operations in the compression. - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setSeed(int seed) { - this.seed = seed; - return this; - } /** - * Set the valid compression strategies used for the compression. - * - * @param validCompressions An EnumSet of CompressionTypes to use in the compression - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setValidCompressions(EnumSet<CompressionType> validCompressions) { - // should always contain Uncompressed as an option.
-		if(!validCompressions.contains(CompressionType.UNCOMPRESSED))
-			validCompressions.add(CompressionType.UNCOMPRESSED);
-		if(!validCompressions.contains(CompressionType.CONST))
-			validCompressions.add(CompressionType.CONST);
-		if(!validCompressions.contains(CompressionType.EMPTY))
-			validCompressions.add(CompressionType.EMPTY);
-		this.validCompressions = validCompressions;
-		return this;
-	}
+	/**
+	 * Allow the Dictionaries to be shared between different column groups.
+	 *
+	 * @param allowSharedDictionary A boolean specifying if the dictionary can be shared between column groups.
+	 * @return The CompressionSettingsBuilder
+	 */
+	public CompressionSettingsBuilder setAllowSharedDictionary(boolean allowSharedDictionary) {
+		this.allowSharedDictionary = allowSharedDictionary;
+		return this;
+	}
-	/**
-	 * Add a single valid compression type to the EnumSet of valid compressions.
-	 *
-	 * @param cp The compression type to add to the valid ones.
-	 * @return The CompressionSettingsBuilder
-	 */
-	public CompressionSettingsBuilder addValidCompression(CompressionType cp) {
-		this.validCompressions.add(cp);
-		return this;
-	}
+	/**
+	 * Specify if the input matrix should be transposed before compression. This improves cache efficiency while
+	 * compressing the input matrix.
+	 *
+	 * @param transposeInput string specifying if the input should be transposed before compression, should be one of
+	 *   "auto", "true" or "false"
+	 * @return The CompressionSettingsBuilder
+	 */
+	public CompressionSettingsBuilder setTransposeInput(String transposeInput) {
+		switch(transposeInput) {
+			case "auto":
+			case "true":
+			case "false":
+				this.transposeInput = transposeInput;
+				break;
+			default:
+				throw new DMLCompressionException("Invalid transpose technique");
+		}
+		return this;
+	}
 	/**
-	 * Target total loss for piecewise linear compression.
-	 * Interpretation: the maximum allowed global MSE per value in the column.
-	 * 0.0 ~ effectively lossless, many segments
-	 * >0 ~ more approximation allowed, fewer segments
+	 * Set the seed for the compression operation.
+	 *
+	 * @param seed The seed used in sampling the matrix and general operations in the compression.
+	 * @return The CompressionSettingsBuilder
+	 */
+	public CompressionSettingsBuilder setSeed(int seed) {
+		this.seed = seed;
+		return this;
+	}
+	/**
+	 * Set the valid compression strategies used for the compression.
+	 *
+	 * @param validCompressions An EnumSet of CompressionTypes to use in the compression
+	 * @return The CompressionSettingsBuilder
+	 */
+	public CompressionSettingsBuilder setValidCompressions(EnumSet<CompressionType> validCompressions) {
+		// should always contain Uncompressed as an option.
+		if(!validCompressions.contains(CompressionType.UNCOMPRESSED))
+			validCompressions.add(CompressionType.UNCOMPRESSED);
+		if(!validCompressions.contains(CompressionType.CONST))
+			validCompressions.add(CompressionType.CONST);
+		if(!validCompressions.contains(CompressionType.EMPTY))
+			validCompressions.add(CompressionType.EMPTY);
+		this.validCompressions = validCompressions;
+		return this;
+	}
-	public void setPiecewiseTargetLoss(double piecewiseTargetLoss) {
-		this.piecewiseTargetLoss = piecewiseTargetLoss;
-	}*/
+	/**
+	 * Add a single valid compression type to the EnumSet of valid compressions.
+	 *
+	 * @param cp The compression type to add to the valid ones.
+	 * @return The CompressionSettingsBuilder
+	 */
+	public CompressionSettingsBuilder addValidCompression(CompressionType cp) {
+		this.validCompressions.add(cp);
+		return this;
+	}
-	/**
-	 * Clear all the compression types allowed in the compression. This will only allow the Uncompressed ColGroup type.
-	 * Since this is required for operation of the compression
-	 *
-	 * @return The CompressionSettingsBuilder
-	 */
-	public CompressionSettingsBuilder clearValidCompression() {
-		this.validCompressions = EnumSet.of(CompressionType.UNCOMPRESSED, CompressionType.EMPTY, CompressionType.CONST);
-		return this;
-	}
+	/**
+	 * Clear all the compression types allowed in the compression. This leaves only the Uncompressed, Empty and Const
+	 * ColGroup types, since these are required for the compression to operate.
+	 *
+	 * @return The CompressionSettingsBuilder
+	 */
+	public CompressionSettingsBuilder clearValidCompression() {
+		this.validCompressions = EnumSet.of(CompressionType.UNCOMPRESSED, CompressionType.EMPTY, CompressionType.CONST);
+		return this;
+	}
-	/**
-	 * Set the type of CoCoding Partitioner type to use for combining columns together.
-	 *
-	 * @param columnPartitioner The Strategy to select from PartitionerType
-	 * @return The CompressionSettingsBuilder
-	 */
-	public CompressionSettingsBuilder setColumnPartitioner(PartitionerType columnPartitioner) {
-		this.columnPartitioner = columnPartitioner;
-		return this;
-	}
+	/**
+	 * Set the CoCoding Partitioner type to use for combining columns together.
+	 *
+	 * @param columnPartitioner The Strategy to select from PartitionerType
+	 * @return The CompressionSettingsBuilder
+	 */
+	public CompressionSettingsBuilder setColumnPartitioner(PartitionerType columnPartitioner) {
+		this.columnPartitioner = columnPartitioner;
+		return this;
+	}
-	/**
-	 * Set the maximum number of columns to CoCode together in the CoCoding strategy. Compression time increase with
-	 * higher numbers.
-	 *
-	 * @param maxColGroupCoCode The max selected.
-	 * @return The CompressionSettingsBuilder
-	 */
-	public CompressionSettingsBuilder setMaxColGroupCoCode(int maxColGroupCoCode) {
-		this.maxColGroupCoCode = maxColGroupCoCode;
-		return this;
-	}
+	/**
+	 * Set the maximum number of columns to CoCode together in the CoCoding strategy. Compression time increases with
+	 * higher numbers.
+	 *
+	 * @param maxColGroupCoCode The max selected.
+	 * @return The CompressionSettingsBuilder
+	 */
+	public CompressionSettingsBuilder setMaxColGroupCoCode(int maxColGroupCoCode) {
+		this.maxColGroupCoCode = maxColGroupCoCode;
+		return this;
+	}
-	/**
-	 * Set the coCode percentage, the effect is different based on the coCoding strategy, but the general effect is that
-	 * higher values results in more coCoding while lower values result in less.
-	 *
-	 * Note that with high coCoding the compression ratio would possibly be lower.
-	 *
-	 * @param coCodePercentage The percentage to set.
-	 * @return The CompressionSettingsBuilder
-	 */
-	public CompressionSettingsBuilder setCoCodePercentage(double coCodePercentage) {
-		this.coCodePercentage = coCodePercentage;
-		return this;
-	}
+	/**
+	 * Set the coCode percentage. The effect differs based on the coCoding strategy, but in general
+	 * higher values result in more coCoding while lower values result in less.
+	 *
+	 * Note that with high coCoding the compression ratio would possibly be lower.
+	 *
+	 * @param coCodePercentage The percentage to set.
+	 * @return The CompressionSettingsBuilder
+	 */
+	public CompressionSettingsBuilder setCoCodePercentage(double coCodePercentage) {
+		this.coCodePercentage = coCodePercentage;
+		return this;
+	}
-	/**
-	 * Set the minimum sample size to extract from a given matrix, this overrules the sample percentage if the sample
-	 * percentage extracted is lower than this minimum bound.
-	 *
-	 * @param minimumSampleSize The minimum sample size to extract
-	 * @return The CompressionSettingsBuilder
-	 */
-	public CompressionSettingsBuilder setMinimumSampleSize(int minimumSampleSize) {
-		this.minimumSampleSize = minimumSampleSize;
-		return this;
-	}
+	/**
+	 * Set the minimum sample size to extract from a given matrix; this overrules the sample percentage if the sample
+	 * percentage extracted is lower than this minimum bound.
+	 *
+	 * @param minimumSampleSize The minimum sample size to extract
+	 * @return The CompressionSettingsBuilder
+	 */
+	public CompressionSettingsBuilder setMinimumSampleSize(int minimumSampleSize) {
+		this.minimumSampleSize = minimumSampleSize;
+		return this;
+	}
-	/**
-	 * Set the maximum sample size to extract from a given matrix, this overrules the sample percentage if the sample
-	 * percentage extracted is higher than this maximum bound.
-	 *
-	 * @param maxSampleSize The maximum sample size to extract
-	 * @return The CompressionSettingsBuilder
-	 */
-	public CompressionSettingsBuilder setMaxSampleSize(int maxSampleSize) {
-		this.maxSampleSize = maxSampleSize;
-		return this;
-	}
+	/**
+	 * Set the maximum sample size to extract from a given matrix; this overrules the sample percentage if the sample
+	 * percentage extracted is higher than this maximum bound.
+	 *
+	 * @param maxSampleSize The maximum sample size to extract
+	 * @return The CompressionSettingsBuilder
+	 */
+	public CompressionSettingsBuilder setMaxSampleSize(int maxSampleSize) {
+		this.maxSampleSize = maxSampleSize;
+		return this;
+	}
-	/**
-	 * Set the estimation type used for the sampled estimates.
-	 *
-	 * @param estimationType the estimation type in used.
-	 * @return The CompressionSettingsBuilder
-	 */
-	public CompressionSettingsBuilder setEstimationType(EstimationType estimationType) {
-		this.estimationType = estimationType;
-		return this;
-	}
+	/**
+	 * Set the estimation type used for the sampled estimates.
+	 *
+	 * @param estimationType the estimation type used.
+	 * @return The CompressionSettingsBuilder
+	 */
+	public CompressionSettingsBuilder setEstimationType(EstimationType estimationType) {
+		this.estimationType = estimationType;
+		return this;
+	}
-	/**
-	 * Set the cost type used for estimating the cost of column groups default is memory based.
-	 *
-	 * @param costType The Cost type wanted
-	 * @return The CompressionSettingsBuilder
-	 */
-	public CompressionSettingsBuilder setCostType(CostType costType) {
-		this.costType = costType;
-		return this;
-	}
+	/**
+	 * Set the cost type used for estimating the cost of column groups; the default is memory based.
+	 *
+	 * @param costType The Cost type wanted
+	 * @return The CompressionSettingsBuilder
+	 */
+	public CompressionSettingsBuilder setCostType(CostType costType) {
+		this.costType = costType;
+		return this;
+	}
-	/**
-	 * Set the minimum compression ratio to be achieved by the compression.
-	 *
-	 * @param ratio The ratio to achieve while compressing
-	 * @return The CompressionSettingsBuilder
-	 */
-	public CompressionSettingsBuilder setMinimumCompressionRatio(double ratio) {
-		this.minimumCompressionRatio = ratio;
-		return this;
-	}
+	/**
+	 * Set the minimum compression ratio to be achieved by the compression.
+	 *
+	 * @param ratio The ratio to achieve while compressing
+	 * @return The CompressionSettingsBuilder
+	 */
+	public CompressionSettingsBuilder setMinimumCompressionRatio(double ratio) {
+		this.minimumCompressionRatio = ratio;
+		return this;
+	}
-	/**
-	 * Inform the compression that it is executed in a spark instruction.
-	 *
-	 * @return The CompressionSettingsBuilder
-	 */
-	public CompressionSettingsBuilder setIsInSparkInstruction() {
-		this.isInSparkInstruction = true;
-		return this;
-	}
+	/**
+	 * Inform the compression that it is executed in a spark instruction.
+	 *
+	 * @return The CompressionSettingsBuilder
+	 */
+	public CompressionSettingsBuilder setIsInSparkInstruction() {
+		this.isInSparkInstruction = true;
+		return this;
+	}
-	/**
-	 * Set the sort type to use.
-	 *
-	 * @param sdcSortType The sort type for the construction of SDC groups
-	 * @return The CompressionSettingsBuilder
-	 */
-	public CompressionSettingsBuilder setSDCSortType(SORT_TYPE sdcSortType) {
-		this.sdcSortType = sdcSortType;
-		return this;
-	}
+	/**
+	 * Set the sort type to use.
+	 *
+	 * @param sdcSortType The sort type for the construction of SDC groups
+	 * @return The CompressionSettingsBuilder
+	 */
+	public CompressionSettingsBuilder setSDCSortType(SORT_TYPE sdcSortType) {
+		this.sdcSortType = sdcSortType;
+		return this;
+	}
-	/**
-	 * Set whether to prefer delta encoding during compression estimation.
-	 * When enabled, the compression estimator will use delta encoding statistics
-	 * instead of regular encoding statistics.
-	 *
-	 * @param preferDeltaEncoding Whether to prefer delta encoding
-	 * @return The CompressionSettingsBuilder
-	 */
-	public CompressionSettingsBuilder setPreferDeltaEncoding(boolean preferDeltaEncoding) {
-		this.preferDeltaEncoding = preferDeltaEncoding;
-		return this;
-	}
+	/**
+	 * Set whether to prefer delta encoding during compression estimation.
+	 * When enabled, the compression estimator will use delta encoding statistics
+	 * instead of regular encoding statistics.
+	 *
+	 * @param preferDeltaEncoding Whether to prefer delta encoding
+	 * @return The CompressionSettingsBuilder
+	 */
+	public CompressionSettingsBuilder setPreferDeltaEncoding(boolean preferDeltaEncoding) {
+		this.preferDeltaEncoding = preferDeltaEncoding;
+		return this;
+	}
-	/**
-	 * Create the CompressionSettings object to use in the compression.
-	 *
-	 * @return The CompressionSettings
-	 */
-	public CompressionSettings create() {
-		return new CompressionSettings(samplingRatio, samplePower, allowSharedDictionary, transposeInput, seed, lossy,
-			validCompressions, sortValuesByLength, columnPartitioner, maxColGroupCoCode, coCodePercentage,
-			minimumSampleSize, maxSampleSize, estimationType, costType, minimumCompressionRatio, isInSparkInstruction,
-			sdcSortType, scaleFactors, preferDeltaEncoding);
-	}
-}
+	/**
+	 * Create the CompressionSettings object to use in the compression.
+	 *
+	 * @return The CompressionSettings
+	 */
+	public CompressionSettings create() {
+		return new CompressionSettings(samplingRatio, samplePower, allowSharedDictionary, transposeInput, seed, lossy,
+			validCompressions, sortValuesByLength, columnPartitioner, maxColGroupCoCode, coCodePercentage,
+			minimumSampleSize, maxSampleSize, estimationType, costType, minimumCompressionRatio, isInSparkInstruction,
+			sdcSortType, scaleFactors, preferDeltaEncoding);
+	}
+}
\ No newline at end of file
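For orientation, the builder above is used fluently; a minimal usage sketch of the setters documented in this file (the values are purely illustrative, not recommended defaults):

    CompressionSettings cs = new CompressionSettingsBuilder()
        .setSamplingRatio(0.1)       // sample 10% of the input rows
        .setSeed(42)                 // deterministic sampling
        .setLossy(false)
        .setTransposeInput("auto")   // one of "auto", "true", "false"
        .setMaxColGroupCoCode(10)
        .create();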
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
index aa11a0c00a0..49901004ff0 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
@@ -1080,7 +1080,8 @@ public static AColGroup compressPiecewiseLinearFunctional(IColIndex colIndexes,
 
 		// determine breakpoints: partitioning into segments
-		List<Integer> breakpointsList = computeBreakpoints(cs, column);
+		double targetLoss = 1e-3;
+		List<Integer> breakpointsList = computeBreakpoints(cs, column,targetLoss);
 		int[] breakpoints = breakpointsList.stream().mapToInt(Integer::intValue).toArray();
 		// for each segment, linear regression as the compression scheme
@@ -1117,10 +1118,9 @@ public static double[] getColumn(MatrixBlock in, int colIndex) {
 		}
 		return column;
 	}
-	public static List<Integer> computeBreakpoints(CompressionSettings cs, double[] column){
+	public static List<Integer> computeBreakpoints(CompressionSettings cs, double[] column, double targetloss){
 		int n = column.length;
-		double targetMSE = cs.getPiecewiseTargetLoss(); // read only, do NOT set!
-
+		double targetMSE = targetloss;
 		// case A: no target loss given -> simple variant with a fixed λ
 		if (Double.isNaN(targetMSE) || targetMSE <= 0) {
 			double lambda = 5.0;
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java
index e9e4cd1572b..af4b6dad172 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java
@@ -10,7 +10,7 @@
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.data.SparseBlockMCSR;
 import org.apache.sysds.runtime.functionobjects.Builtin;
-import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
+import org.apache.sysds.runtime.instructions.cp.CmCovObject;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysds.runtime.matrix.operators.CMOperator;
@@ -294,10 +294,11 @@ public void computeColSums(double[] c, int nRows) {
 	}
 
 	@Override
-	public CM_COV_Object centralMoment(CMOperator op, int nRows) {
+	public CmCovObject centralMoment(CMOperator op, int nRows) {
 		return null;
 	}
 
+
 	@Override
 	public AColGroup rexpandCols(int max, boolean ignore, boolean cast, int nRows) {
 		return null;
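A hypothetical driver for the three-argument computeBreakpoints signature this hunk introduces (note that a later patch in this series moves the target loss back into CompressionSettings); the column and the expected result follow the tests below:

    double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0};
    double targetLoss = 1e-3;   // per-value MSE budget; internally scaled to an SSE budget of n * targetLoss
    List<Integer> breaks = ColGroupFactory.computeBreakpoints(cs, column, targetLoss);
    // expected: [0, 3, 6], i.e. two exactly fittable segments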
diff --git a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java
index 5a740624d2d..b41155c0e49 100644
--- a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java
+++ b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java
@@ -3,17 +3,19 @@
 import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.CompressionSettingsBuilder;
 import org.apache.sysds.runtime.compress.colgroup.indexes.ArrayIndex;
+import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
 import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
 import org.apache.sysds.runtime.compress.colgroup.scheme.ColGroupPiecewiseLinearCompressed;
 import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.junit.Test;
 import org.junit.jupiter.api.BeforeEach;
 
 import java.util.Arrays;
 import java.util.List;
 
-import static org.apache.sysds.runtime.compress.colgroup.ColGroupFactory.computeBreakpoints;
+import static org.apache.sysds.runtime.compress.colgroup.ColGroupFactory.*;
 import static org.junit.Assert.*;
 
 /**
  * Tests for PiecewiseLinearColGroupCompressed, focused on:
  * - constructor / create(...)
  * - decompressToDenseBlock(...)
  */
 //TODO Fix
 public class ColGroupPiecewiseLinearCompressedTest {
 
 	private CompressionSettings cs;
 	// -------------------------------------------------------------
 	// 1. create(...) and constructor
 	// -------------------------------------------------------------
 	@BeforeEach
 	void setUp() {
 		CompressionSettings cs = new CompressionSettingsBuilder().create();
 	}
@@ -37,36 +39,388 @@ void setUp() {
 	@Test
 	public void testComputeBreakpoints_uniformColumn() {
-		cs.setPiecewiseTargetLoss(1e-3);
+		//cs.setPiecewiseTargetLoss(1e-3);
 		double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; // test-specific input
-		List<Integer> breaks = computeBreakpoints(cs, column);
-		assertEquals(Arrays.asList(0), breaks); // expected: no breaks
+		List<Integer> breaks = computeBreakpoints(cs, column,1e-3);
+		assertEquals(Arrays.asList(0,5), breaks); // expected: one segment, no interior breaks
 	}
 
 	@Test
 	public void testComputeBreakpoints_linearIncreasing() {
-		cs.setPiecewiseTargetLoss(1e-3);
+		//cs.setPiecewiseTargetLoss(1e-3);
 		double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; // a different column
-		List<Integer> breaks = computeBreakpoints(cs, column);
-		assertEquals(Arrays.asList(0, 2), breaks); // expected
+		List<Integer> breaks = computeBreakpoints(cs, column,1e-3);
+		assertEquals(Arrays.asList(0, 5), breaks); // expected
 	}
 
 	@Test
 	public void testComputeBreakpoints_highLoss_uniform() {
-		cs.setPiecewiseTargetLoss(1.0); // a different loss
+		//cs.setPiecewiseTargetLoss(1.0); // a different loss
 		double[] column = {1.0, 1.0, 1.0, 1.0, 1.0};
-		List<Integer> breaks = computeBreakpoints(cs, column);
-		assertEquals(Arrays.asList(0), breaks);
+		List<Integer> breaks = computeBreakpoints(cs, column,10000.0);
+		assertEquals(Arrays.asList(0,5), breaks);
+	}
+	@Test
+	public void testComputeBreakpoints_twoSegments() {
+		// {1,1,1, 2,2,2} -> 2 segments -> [0,3,6]
+		double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0};
+		var breaks = computeBreakpoints(cs, column, 1e-3);
+		assertEquals(Arrays.asList(0, 3, 6), breaks);
 	}
 
 	@Test
 	public void testComputeBreakpoints_noLoss_linear() {
-		cs.setPiecewiseTargetLoss(0.0);
+		//cs.setPiecewiseTargetLoss(0.0);
 		double[] column = {0.0, 1.0, 2.0, 3.0, 4.0};
-		List<Integer> breaks = computeBreakpoints(cs, column);
-		assertEquals(Arrays.asList(0, 1, 2, 3), breaks); // with 0 loss, all breakpoints
+		List<Integer> breaks = computeBreakpoints(cs, column,0.0);
+		assertEquals(Arrays.asList(0,5), breaks); // zero loss falls back to the fixed-λ path -> one segment
+	}
+	@Test
+	public void testComputeBreakpointsLambda_const() {
+		double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; // 5 values
+		List<Integer> breaks = computeBreakpointsLambda(column, 5.0);
+		assertEquals(Arrays.asList(0, 5), breaks); // 0 to 5
+
+		breaks = computeBreakpointsLambda(column, 0.01);
+		assertEquals(Arrays.asList(0, 5), breaks); // also with a small lambda
+	}
+	@Test
+	public void testComputeBreakpointsLambda_twoSegments() {
+		double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0}; // 6 values
+
+		// small lambda -> many segments (splits are almost free)
+		List<Integer> breaks = computeBreakpointsLambda(column, 0.01);
+		assertTrue(breaks.contains(3)); // 3 must be contained as a boundary
+		assertEquals(3, breaks.size()); // 0, 3, 6
+		assertEquals(Arrays.asList(0, 3, 6), breaks);
+
+		// large lambda -> only one segment
+		breaks = computeBreakpointsLambda(column, 1000.0);
+		assertEquals(Arrays.asList(0, 6), breaks);
+	}
+	@Test
+	public void testComputeBreakpointsLambda_jumpWithTrend() {
+		double[] column = {0.0, 1.0, 2.0, 10.0, 11.0, 12.0};
+
+		// coarse fit: one segment per piece
+		List<Integer> breaks = computeBreakpointsLambda(column, 0.5);
+		assertEquals(Arrays.asList(0, 3, 6), breaks);
+
+		// only one segment when lambda is very large
+		breaks = computeBreakpointsLambda(column, 100.0);
+		assertEquals(Arrays.asList(0, 6), breaks);
+	}
+
+	@Test
+	public void testComputeBreakpointsLambda_linear() {
+		double[] column = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0};
+
+		List<Integer> breaks = computeBreakpointsLambda(column, 1.0);
+		assertEquals(Arrays.asList(0, 6), breaks);
+
+		// very small lambda: only check that the boundaries are sane
+		breaks = computeBreakpointsLambda(column, 0.001);
+		assertTrue(breaks.size() >= 2);
+		assertTrue(breaks.get(0) == 0);
+		assertTrue(breaks.get(breaks.size() - 1) == column.length);
+	}
+	@Test
+	public void testComputeBreakpointsLambda_edge_lambdaVerySmall() {
+		double[] column = {1.0, 1.1, 1.0, 1.1, 1.0};
+
+		List<Integer> breaks = computeBreakpointsLambda(column, 0.001);
+		assertNotNull(breaks);
+		assertFalse(breaks.isEmpty());
+		assertEquals(0, (int) breaks.get(0));
+		assertEquals(column.length, (int) breaks.get(breaks.size() - 1));
+
+		// check that the list is sorted
+		for (int i = 1; i < breaks.size(); i++) {
+			assertTrue(breaks.get(i) >= breaks.get(i - 1));
+		}
+	}
+	@Test
+	public void testComputeBreakpointsLambda_edge_lambdaVeryLarge() {
+		double[] column = {1.0, 2.0, 1.5, 2.5, 1.8};
+
+		List<Integer> breaks = computeBreakpointsLambda(column, 1000.0);
+		assertEquals(Arrays.asList(0, 5), breaks);
+	}
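For intuition on the λ penalty these tests probe: the DP minimizes total SSE plus λ per segment, so λ is the price of one extra segment. A hand-computed check for the {1,1,1,2,2,2} column used above, via the helpers from ColGroupFactory:

    double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0};
    double oneSeg = computeSegmentCost(column, 0, 6);   // 12/35 ≈ 0.343
    double twoSeg = computeSegmentCost(column, 0, 3)
                  + computeSegmentCost(column, 3, 6);   // 0.0
    // cost(1 segment) = 0.343 + λ, cost(2 segments) = 0 + 2λ
    // -> the DP splits exactly when λ < 0.343, matching the 0.01 vs 1000.0 cases above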
+	@Test
+	public void testComputeSegmentCost_emptyOrSingle() {
+		double[] column = {10.0, 20.0, 30.0};
+
+		// 0 elements (empty)
+		assertEquals(0.0, computeSegmentCost(column, 0, 0), 1e-10);
+		assertEquals(0.0, computeSegmentCost(column, 1, 1), 1e-10);
+
+		// 1 element -> the regression line is not uniquely defined, but SSE = 0
+		assertEquals(0.0, computeSegmentCost(column, 0, 1), 1e-10);
+		assertEquals(0.0, computeSegmentCost(column, 1, 2), 1e-10);
+		assertEquals(0.0, computeSegmentCost(column, 2, 3), 1e-10);
+	}
+	@Test
+	public void testComputeSegmentCost_twoConstantPoints() {
+		double[] column = {5.0, 5.0, 1.0, 1.0};
+
+		// two identical points (constant) -> SSE = 0
+		double sse = computeSegmentCost(column, 0, 2);
+		assertEquals(0.0, sse, 1e-10);
+	}
+	@Test
+	public void testComputeSegmentCost_twoDifferentPoints() {
+		double[] column = {0.0, 2.0, 1.0, 3.0};
+
+		// two points: (0,0) and (1,2) -> line y = 2*x, error = 0
+		double sse = computeSegmentCost(column, 0, 2);
+		assertEquals(0.0, sse, 1e-10);
+
+		// two points: (2,1) and (3,3) -> line y = 2*x - 3, error = 0
+		sse = computeSegmentCost(column, 2, 4);
+		assertEquals(0.0, sse, 1e-10);
+	}
+	@Test
+	public void testComputeSegmentCost_constantThree() {
+		double[] column = {0.0, 0.0, 0.0};
+		double sse = computeSegmentCost(column, 0, 3);
+		assertEquals(0.0, sse, 1e-10);
+	}
+	@Test
+	public void testComputeSegmentCost_consistent_with_regression() {
+		double[] column = {0.0, 2.0, 0.0, 4.0, 0.0, 6.0}; // 6 points
+
+		int start = 0, end = 3;
+		double[] ab = regressSegment(column, start, end);
+		double slope = ab[0], intercept = ab[1];
+		double sse_hand = 0.0;
+		for (int i = start; i < end; i++) {
+			double yhat = slope * i + intercept;
+			double diff = column[i] - yhat;
+			sse_hand += diff * diff;
+		}
+
+		double sse = computeSegmentCost(column, start, end);
+		assertEquals(sse_hand, sse, 1e-10);
+	}
+	@Test
+	public void testComputeTotalSSE_emptyBreaks() {
+		double[] column = {1.0, 2.0, 3.0};
+		List<Integer> breaks = Arrays.asList(); // empty -> no segments
+		double total = computeTotalSSE(column, breaks);
+
+		// 0 segments -> sum over 0 segments = 0
+		assertEquals(0.0, total, 1e-10);
+	}
+	@Test
+	public void testComputeTotalSSE_singleSegment_all() {
+		double[] column = {1.0, 2.0, 3.0};
+		List<Integer> breaks = Arrays.asList(0, 3); // one segment [0,3)
+
+		double total = computeTotalSSE(column, breaks);
+		double expected = computeSegmentCost(column, 0, 3);
+
+		// the result must exactly equal the SSE of the whole segment
+		assertEquals(expected, total, 1e-10);
+	}
+	@Test
+	public void testComputeTotalSSE_twoSegments() {
+		// example: [0,0,0] and [1,1,1] (each constant)
+		double[] column = {0.0, 0.0, 0.0, 1.0, 1.0, 1.0};
+		List<Integer> breaks = Arrays.asList(0, 3, 6); // two segments
+
+		double total = computeTotalSSE(column, breaks);
+		double sse1 = computeSegmentCost(column, 0, 3); // [0,0,0] -> SSE = 0
+		double sse2 = computeSegmentCost(column, 3, 6); // [1,1,1] -> SSE = 0
+
+		// both segments are constant, so totalSSE must be 0
+		assertEquals(0.0, total, 1e-10);
+		assertEquals(sse1 + sse2, total, 1e-10);
+	}
+	@Test
+	public void testComputeTotalSSE_threeSegments() {
+		// one segment with three identical values, two segments with two values each
+		double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0};
+		List<Integer> breaks = Arrays.asList(0, 3, 5, 7);
+
+		// segment [0,3): constant 1.0 -> SSE = 0
+		double sse1 = computeSegmentCost(column, 0, 3); // 0
+
+		// segment [3,5): [2,2] -> SSE = 0
+		double sse2 = computeSegmentCost(column, 3, 5); // 0
+
+		// segment [5,7): [3,3] -> SSE = 0
+		double sse3 = computeSegmentCost(column, 5, 7); // 0
+
+		double total = computeTotalSSE(column, breaks);
+		assertEquals(0.0, total, 1e-10);
+		assertEquals(sse1 + sse2 + sse3, total, 1e-10);
+	}
+	@Test
+	public void testComputeTotalSSE_gapStartEnd() {
+		double[] column = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0};
+		List<Integer> breaks = Arrays.asList(2, 5, 8); // segments [2,5), [5,8)
+
+		double total = computeTotalSSE(column, breaks);
+		double sse1 = computeSegmentCost(column, 2, 5);
+		double sse2 = computeSegmentCost(column, 5, 8);
+
+		// result: sum of the two segments
+		assertEquals(sse1 + sse2, total, 1e-10);
+
+		// indices <2 and >=8 belong to no segment and do not contribute to totalSSE
+	}
+	@Test
+	public void testComputeTotalSSE_oneSegment_identical() {
+		double[] column = {1.0, 2.0, 3.0, 4.0, 5.0};
+
+		// compare: SSE of the whole segment over [0,5)
+		double sseTotal = computeSegmentCost(column, 0, 5);
+
+		// computed via computeTotalSSE with breaks [0,5]
+		List<Integer> breaks = Arrays.asList(0, 5);
+		double total = computeTotalSSE(column, breaks);
+
+		// both must match exactly
+		assertEquals(sseTotal, total, 1e-10);
+	}
+	@Test
+	public void testComputeTotalSSE_nonConstant() {
+		double[] column = {0.0, 1.0, 2.0, 3.0, 4.0};
+		List<Integer> breaks = Arrays.asList(0, 2, 5); // [0,2), [2,5)
+
+		double total = computeTotalSSE(column, breaks);
+		double sse1 = computeSegmentCost(column, 0, 2);
+		double sse2 = computeSegmentCost(column, 2, 5);
+
+		// sanity check: result non-negative, sum of the two SSEs
+		assertTrue(total >= 0.0);
+		assertEquals(sse1 + sse2, total, 1e-10);
+	}
+	@Test
+	public void testComputeTotalSSE_edgeCases() {
+		// empty column, breaks [0,0] -> no segment
+		double[] columnEmpty = {}; // length 0
+		List<Integer> breaksEmpty = Arrays.asList(0, 0);
+		assertEquals(0.0, computeTotalSSE(columnEmpty, breaksEmpty), 1e-10);
+
+		// column of length 1, one segment [0,1)
+		double[] columnOne = {42.0};
+		List<Integer> breaksOne = Arrays.asList(0, 1);
+		double total = computeTotalSSE(columnOne, breaksOne);
+		assertEquals(0.0, total, 1e-10);
+	}
+	@Test
+	public void testRegressSegment_empty() {
+		double[] column = {1.0, 2.0, 3.0};
+		double[] result = regressSegment(column, 0, 0); // empty
+		assertEquals(0.0, result[0], 1e-10);
+		assertEquals(0.0, result[1], 1e-10);
+	}
+	@Test
+	public void testRegressSegment_singlePoint() {
+		double[] column = {1.0, 2.0, 3.0};
+		double[] result = regressSegment(column, 1, 2); // only i=1: y=2.0
+
+		assertEquals(0.0, result[0], 1e-10); // slope = 0
+		assertEquals(2.0, result[1], 1e-10); // intercept = mean
+	}
+	@Test
+	public void testRegressSegment_twoIdentical() {
+		double[] column = {5.0, 5.0, 1.0, 1.0};
+		double[] result = regressSegment(column, 0, 2); // i=0:5, i=1:5
+
+		// slope = 0, y = 5.0 + 0*i
+		assertEquals(0.0, result[0], 1e-10);
+		assertEquals(5.0, result[1], 1e-10);
+	}
+	@Test
+	public void testRegressSegment_twoPoints() {
+		double[] column = {0.0, 2.0}; // (i=0, y=0), (i=1, y=2)
+		double[] result = regressSegment(column, 0, 2);
+
+		// line through (0,0) and (1,2) -> y = 2*i + 0
+		assertEquals(2.0, result[0], 1e-10);
+		assertEquals(0.0, result[1], 1e-10);
+	}
+	@Test
+	public void testRegressSegment_twoPoints_offset() {
+		// column[0], column[1], column[2], column[3] -> there are 4 values
+		double[] column = {1.0, 3.0, 5.0, 7.0}; // e.g. y = 2*x + 1 -> at x=2: y=5, x=3: y=7
+		double[] result = regressSegment(column, 2, 4); // segment [2,4) -> i=2,3
+
+		// line through (2,5), (3,7): slope = 2, intercept = 1
+		assertEquals(2.0, result[0], 1e-10);
+		assertEquals(1.0, result[1], 1e-10);
+	}
+	@Test
+	public void testRegressSegment_constant() {
+		double[] column = {3.0, 3.0, 3.0, 3.0};
+		double[] result = regressSegment(column, 0, 4);
+
+		assertEquals(0.0, result[0], 1e-10);
+		assertEquals(3.0, result[1], 1e-10);
+	}
+	@Test
+	public void testRegressSegment_linear() {
+		double[] column = new double[4];
+		double a = 1.5, b = 2.0;
+		for (int i = 0; i < 4; i++) {
+			column[i] = a * i + b;
+		}
+
+		double[] result = regressSegment(column, 0, 4);
+
+		// exact: slope = 1.5, intercept = 2.0
+		assertEquals(a, result[0], 1e-10);
+		assertEquals(b, result[1], 1e-10);
+	}
+	@Test
+	public void testRegressSegment_denomZero() {
+		// degenerate: a segment with a single point
+		double[] column = {10.0};
+		double[] result = regressSegment(column, 0, 1);
+
+		assertEquals(0.0, result[0], 1e-10);
+		assertEquals(10.0, result[1], 1e-10);
+	}
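The two-point cases above can be checked by hand against the normal equations that regressSegment solves (x is the row index):

    double[] column = {1.0, 3.0, 5.0, 7.0};
    double[] ab = regressSegment(column, 2, 4);  // points (2,5) and (3,7)
    // n=2, sumX=5, sumY=12, sumXX=13, sumXY=31
    // denom = n*sumXX - sumX^2 = 26 - 25 = 1
    // slope = (n*sumXY - sumX*sumY) / denom = (62 - 60) / 1 = 2
    // intercept = (sumY - slope*sumX) / n = (12 - 10) / 2 = 1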
+	@Test
+	public void testCompressPiecewiseLinearFunctional_const() {
+		// 1. create a MatrixBlock with a constant column
+		double[] data = {1.0, 1.0, 1.0, 1.0, 1.0}; // 5 rows, 1 column
+		MatrixBlock in = new MatrixBlock(5, 1, false).quickSetMatrix(data, 5);
+
+		// 2. colIndexes for column 0
+		IColIndex colIndexes = ColIndexFactory.create(0);
+
+		// 3. invoke the compression function
+		AColGroup result = ColGroupFactory.compressPiecewiseLinearFunctional(colIndexes, in, new CompressionSettings());
+
+		// 4. result is a ColGroupPiecewiseLinearCompressed?
+		assertTrue(result instanceof ColGroupPiecewiseLinearCompressed);
+		ColGroupPiecewiseLinearCompressed plGroup = (ColGroupPiecewiseLinearCompressed) result;
+
+		// 5. check breakpoints: [0, 5] -> one segment
+		int[] breakpoints = plGroup.c();
+		assertArrayEquals(new int[] {0, 5}, breakpoints);
+
+		// 6. one segment -> one slope, one intercept
+		double[] slopes = plGroup.getSlopes();
+		double[] intercepts = plGroup.getIntercepts();
+		assertEquals(1, slopes.length);
+		assertEquals(1, intercepts.length);
+
+		// 7. constant data: slope ~0, intercept ~1.0
+		assertEquals(0.0, slopes[0], 1e-10);
+		assertEquals(1.0, intercepts[0], 1e-10); // column mean
+
+		// 8. check: colIndexes is correct
+		assertEquals(1, plGroup.getColIndex().size());
+		assertEquals(0, plGroup.getColIndex().get(0));
+	}
+
+}
\ No newline at end of file

From d63aae8d89d157df824e21184da7e96fb90b933e Mon Sep 17 00:00:00 2001
From: mori49 <84979219+mori49@users.noreply.github.com>
Date: Wed, 28 Jan 2026 01:48:56 +0100
Subject: [PATCH 06/21] fix: Methods and testing

---
 .../compress/colgroup/ColGroupFactory.java    |   8 +-
 .../ColGroupPiecewiseLinearCompressed.java    |  85 ++--
 ...ColGroupPiecewiseLinearCompressedTest.java | 405 ++++++++++++++----
 3 files changed, 389 insertions(+), 109 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
index 49901004ff0..06bf74b423c 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
@@ -1080,8 +1080,8 @@ public static AColGroup compressPiecewiseLinearFunctional(IColIndex colIndexes,
 
 		// determine breakpoints: partitioning into segments
-		double targetLoss = 1e-3;
-		List<Integer> breakpointsList = computeBreakpoints(cs, column,targetLoss);
+		double targetLoss = cs.getPiecewiseTargetLoss();
+		List<Integer> breakpointsList = computeBreakpoints(cs, column);
 		int[] breakpoints = breakpointsList.stream().mapToInt(Integer::intValue).toArray();
 		// for each segment, linear regression as the compression scheme
@@ -1118,9 +1118,9 @@ public static double[] getColumn(MatrixBlock in, int colIndex) {
 		}
 		return column;
 	}
-	public static List<Integer> computeBreakpoints(CompressionSettings cs, double[] column, double targetloss){
+	public static List<Integer> computeBreakpoints(CompressionSettings cs, double[] column){
 		int n = column.length;
-		double targetMSE = targetloss;
+		double targetMSE = cs.getPiecewiseTargetLoss();
 		// case A: no target loss given -> simple variant with a fixed λ
 		if (Double.isNaN(targetMSE) || targetMSE <= 0) {
 			double lambda = 5.0;
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java
index af4b6dad172..71e935643d9 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java
@@ -41,9 +41,6 @@ public ColGroupPiecewiseLinearCompressed(IColIndex colIndexes, int[] breakpoints
 	}
 
-
-
-
 	public static AColGroup create(IColIndex colIndexes, int[] breakpoints, double[] slopes, double[] intercepts, int numRows) {
 		if (breakpoints == null || breakpoints.length < 2)
 			throw new IllegalArgumentException("Need at least one segment");
@@ -68,39 +65,44 @@ public static AColGroup create(IColIndex colIndexes, int[] breakpoints, double[]
 
 	@Override
 	public void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC) {
-		final int col = colIndexes.get(0); // with multiple columns: loop
+		// full null-safety on inputs
+		if (db == null || _colIndexes == null || _colIndexes.size() == 0 ||
+			breakpoints == null || slopes == null || intercepts == null) {
+			return;
+		}
 
-		// fetch the internal double[] for the target column(s)
-		// DenseBlock is usually row-major; access via db.values(...)
-		// simplest variant: work row-wise via db.getBlockValues(...)
+		int numSeg = breakpoints.length - 1;
+		if (numSeg <= 0 || rl >= ru) {
+			return;
+		}
 
-		final int numSeg = breakpoints.length - 1;
+		final int col = _colIndexes.get(0);
 
 		for (int s = 0; s < numSeg; s++) {
-			final int segStart = breakpoints[s];
-			final int segEnd = breakpoints[s + 1];
-			final double a = slopes[s];
-			final double b = intercepts[s];
+			int segStart = breakpoints[s];
+			int segEnd = breakpoints[s + 1];
+			if (segStart >= segEnd) continue; // invalid segment
 
-			// restrict the segment to the requested range
-			final int rs = Math.max(segStart, rl);
-			final int re = Math.min(segEnd, ru);
-			if (rs >= re)
-				continue;
+			double a = slopes[s];
+			double b = intercepts[s];
 
-			for (int r = rs; r < re; r++) {
-				double x = r; // same x as used in the fit
-				double yhat = a * x + b;
-
-				// global position in the DenseBlock
-				int gr = r + offR;
-				int gc = col + offC;
+			int rs = Math.max(segStart, rl);
+			int re = Math.min(segEnd, ru);
+			if (rs >= re) continue;
 
-				// db.set(row, col, value)
-				db.set(gr, gc, yhat);
+			for (int r = rs; r < re; r++) {
+				double yhat = a * r + b;
+				int gr = offR + r;
+				int gc = offC + col;
+
+				// bounds check before set()
+				if (gr >= 0 && gr < db.numRows() && gc >= 0 && gc < db.numCols()) {
+					db.set(gr, gc, yhat);
+				}
 			}
 		}
 	}
 
+
 	@Override
 	protected double computeMxx(double c, Builtin builtin) {
 		return 0;
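An aside on the segment lookup that getIdx performs in the next hunk: the linear scan costs O(#segments) per access; since breakpoints is sorted, an equivalent O(log s) variant could use binary search. A hypothetical sketch, not part of the patch, assuming the breakpoints layout created above:

    int pos = java.util.Arrays.binarySearch(breakpoints, r);
    // exact hit: r starts segment pos; miss: r lies in the segment before the insertion point
    int seg = (pos >= 0) ? Math.min(pos, breakpoints.length - 2) : -(pos + 2);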
@@ -198,12 +200,26 @@ public void decompressToSparseBlockTransposed(SparseBlockMCSR sb, int nColOut) {
 
 	@Override
 	public double getIdx(int r, int colIdx) {
-		return 0;
+		// crucial: bounds check for r and colIdx
+		if (r < 0 || r >= numRows || colIdx < 0 || colIdx >= _colIndexes.size()) {
+			return 0.0;
+		}
+
+		// segment search: seg becomes the last segment whose start is <= r
+		int seg = 0;
+		for (int i = 1; i < breakpoints.length - 1; i++) {
+			if (r >= breakpoints[i])
+				seg = i;
+			else
+				break;
+		}
+
+		return slopes[seg] * (double) r + intercepts[seg];
 	}
 
 	@Override
 	public int getNumValues() {
-		return 0;
+		return breakpoints.length + slopes.length + intercepts.length;
 	}
 
 	@Override
@@ -368,5 +384,18 @@ protected void denseSelection(MatrixBlock selection, ColGroupUtils.P[] points, M
 	public AColGroup[] splitReshape(int multiplier, int nRow, int nColOrg) {
 		return new AColGroup[0];
 	}
+
+	public int[] getBreakpoints() {
+		return breakpoints;
+	}
+
+	public double[] getSlopes() {
+		return slopes;
+	}
+
+	public double[] getIntercepts() {
+		return intercepts;
+	}
 }
diff --git a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java
index b41155c0e49..c0ca62ce9d5 100644
--- a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java
+++ b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java
@@ -8,6 +8,7 @@
 import org.apache.sysds.runtime.compress.colgroup.scheme.ColGroupPiecewiseLinearCompressed;
 import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory;
+import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.junit.Test;
 import org.junit.jupiter.api.BeforeEach;
@@ -18,87 +19,83 @@
 import static org.apache.sysds.runtime.compress.colgroup.ColGroupFactory.*;
 import static org.junit.Assert.*;
 
-/**
- * Tests for PiecewiseLinearColGroupCompressed, focused on:
- * - constructor / create(...)
- * - decompressToDenseBlock(...)
- */
-//TODO Fix
-public class ColGroupPiecewiseLinearCompressedTest {
-
-	private CompressionSettings cs;
-	// -------------------------------------------------------------
-	// 1. create(...) and constructor
-	// -------------------------------------------------------------
-	@BeforeEach
-	void setUp() {
-		CompressionSettings cs = new CompressionSettingsBuilder().create();
+public class ColGroupPiecewiseLinearCompressedTest {
 
-	}
 	@Test
 	public void testComputeBreakpoints_uniformColumn() {
-		//cs.setPiecewiseTargetLoss(1e-3);
+		CompressionSettings cs = new CompressionSettingsBuilder().create();
+		cs.setPiecewiseTargetLoss(1e-3);
 		double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; // test-specific input
-		List<Integer> breaks = computeBreakpoints(cs, column,1e-3);
-		assertEquals(Arrays.asList(0,5), breaks); // expected: one segment, no interior breaks
+		List<Integer> breaks = computeBreakpoints(cs, column);
+		assertEquals(Arrays.asList(0, 5), breaks); // expected: one segment, no interior breaks
 	}
 
 	@Test
 	public void testComputeBreakpoints_linearIncreasing() {
-		//cs.setPiecewiseTargetLoss(1e-3);
+		CompressionSettings cs = new CompressionSettingsBuilder().create();
+		cs.setPiecewiseTargetLoss(1e-3);
 		double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; // a different column
-		List<Integer> breaks = computeBreakpoints(cs, column,1e-3);
+		List<Integer> breaks = computeBreakpoints(cs, column);
 		assertEquals(Arrays.asList(0, 5), breaks); // expected
 	}
 
 	@Test
 	public void testComputeBreakpoints_highLoss_uniform() {
-		//cs.setPiecewiseTargetLoss(1.0); // a different loss
+		CompressionSettings cs = new CompressionSettingsBuilder().create();
+		cs.setPiecewiseTargetLoss(10000.0);
 		double[] column = {1.0, 1.0, 1.0, 1.0, 1.0};
-		List<Integer> breaks = computeBreakpoints(cs, column,10000.0);
-		assertEquals(Arrays.asList(0,5), breaks);
+		List<Integer> breaks = computeBreakpoints(cs, column);
+		assertEquals(Arrays.asList(0, 5), breaks);
 	}
+
 	@Test
 	public void testComputeBreakpoints_twoSegments() {
+		CompressionSettings cs = new CompressionSettingsBuilder().create();
+		cs.setPiecewiseTargetLoss(1e-3);
 		// {1,1,1, 2,2,2} -> 2 segments -> [0,3,6]
 		double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0};
-		var breaks = computeBreakpoints(cs, column, 1e-3);
+		var breaks = computeBreakpoints(cs, column);
 		assertEquals(Arrays.asList(0, 3, 6), breaks);
 	}
 
 	@Test
 	public void testComputeBreakpoints_noLoss_linear() {
+		CompressionSettings cs = new CompressionSettingsBuilder().create();
+		cs.setPiecewiseTargetLoss(0.0);
 		//cs.setPiecewiseTargetLoss(0.0);
 		double[] column = {0.0, 1.0, 2.0, 3.0, 4.0};
-		List<Integer> breaks = computeBreakpoints(cs, column,0.0);
-		assertEquals(Arrays.asList(0,5), breaks); // zero loss falls back to the fixed-λ path -> one segment
+		List<Integer> breaks = computeBreakpoints(cs, column);
+		assertEquals(Arrays.asList(0, 5), breaks); // zero loss falls back to the fixed-λ path -> one segment
 	}
 
 	@Test
 	public void testComputeBreakpointsLambda_const() {
-		double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; // 5 values
+		double[] column = {1.0, 1.0, 1.0, 1.0, 1.0};
 		List<Integer> breaks = computeBreakpointsLambda(column, 5.0);
-		assertEquals(Arrays.asList(0, 5), breaks); // 0 to 5
+		assertEquals(Arrays.asList(0, 5), breaks);
 
 		breaks = computeBreakpointsLambda(column, 0.01);
-		assertEquals(Arrays.asList(0, 5), breaks); // also with a small lambda
+		assertEquals(Arrays.asList(0, 5), breaks);
 	}
 
 	@Test
 	public void testComputeBreakpointsLambda_twoSegments() {
 		double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0}; // 6 values
 
 		// small lambda -> many segments (splits are almost free)
 		List<Integer> breaks = computeBreakpointsLambda(column, 0.01);
-		assertTrue(breaks.contains(3)); // 3 must be contained as a boundary
-		assertEquals(3, breaks.size()); // 0, 3, 6
+		assertTrue(breaks.contains(3));
+		assertEquals(3, breaks.size());
 		assertEquals(Arrays.asList(0, 3, 6), breaks);
 
-		// large lambda -> only one segment
+		// a large lambda yields only one segment
 		breaks = computeBreakpointsLambda(column, 1000.0);
 		assertEquals(Arrays.asList(0, 6), breaks);
 	}
+
 	@Test
 	public void testComputeBreakpointsLambda_jumpWithTrend() {
 		double[] column = {0.0, 1.0, 2.0, 10.0, 11.0, 12.0};
@@ -125,6 +122,7 @@ public void testComputeBreakpointsLambda_linear() {
 		assertTrue(breaks.get(0) == 0);
 		assertTrue(breaks.get(breaks.size() - 1) == column.length);
 	}
+
 	@Test
 	public void testComputeBreakpointsLambda_edge_lambdaVerySmall() {
 		double[] column = {1.0, 1.1, 1.0, 1.1, 1.0};
@@ -140,6 +138,7 @@ public void testComputeBreakpointsLambda_edge_lambdaVerySmall() {
 			assertTrue(breaks.get(i) >= breaks.get(i - 1));
 		}
 	}
+
 	@Test
 	public void testComputeBreakpointsLambda_edge_lambdaVeryLarge() {
 		double[] column = {1.0, 2.0, 1.5, 2.5, 1.8};
@@ -147,6 +146,7 @@ public void testComputeBreakpointsLambda_edge_lambdaVeryLarge() {
 		List<Integer> breaks = computeBreakpointsLambda(column, 1000.0);
 		assertEquals(Arrays.asList(0, 5), breaks);
 	}
+
 	@Test
 	public void testComputeSegmentCost_emptyOrSingle() {
 		double[] column = {10.0, 20.0, 30.0};
@@ -160,6 +160,7 @@ public void testComputeSegmentCost_emptyOrSingle() {
 		assertEquals(0.0, computeSegmentCost(column, 1, 2), 1e-10);
 		assertEquals(0.0, computeSegmentCost(column, 2, 3), 1e-10);
 	}
+
 	@Test
 	public void testComputeSegmentCost_twoConstantPoints() {
 		double[] column = {5.0, 5.0, 1.0, 1.0};
@@ -168,6 +169,7 @@ public void testComputeSegmentCost_twoConstantPoints() {
 		double sse = computeSegmentCost(column, 0, 2);
 		assertEquals(0.0, sse, 1e-10);
 	}
+
 	@Test
 	public void testComputeSegmentCost_twoDifferentPoints() {
 		double[] column = {0.0, 2.0, 1.0, 3.0};
@@ -180,15 +182,17 @@ public void testComputeSegmentCost_twoDifferentPoints() {
 		sse = computeSegmentCost(column, 2, 4);
 		assertEquals(0.0, sse, 1e-10);
 	}
+
 	@Test
 	public void testComputeSegmentCost_constantThree() {
 		double[] column = {0.0, 0.0, 0.0};
 		double sse = computeSegmentCost(column, 0, 3);
 		assertEquals(0.0, sse, 1e-10);
 	}
+
 	@Test
 	public void testComputeSegmentCost_consistent_with_regression() {
-		double[] column = {0.0, 2.0, 0.0, 4.0, 0.0, 6.0}; // 6 points
+		double[] column = {0.0, 2.0, 0.0, 4.0, 0.0, 6.0};
 
 		int start = 0, end = 3;
 		double[] ab = regressSegment(column, start, end);
@@ -203,6 +207,7 @@ public void testComputeSegmentCost_consistent_with_regression() {
 		double sse = computeSegmentCost(column, start, end);
 		assertEquals(sse_hand, sse, 1e-10);
 	}
+
 	@Test
 	public void testComputeTotalSSE_emptyBreaks() {
 		double[] column = {1.0, 2.0, 3.0};
@@ -212,6 +217,7 @@ public void testComputeTotalSSE_emptyBreaks() {
 		// 0 segments -> sum over 0 segments = 0
 		assertEquals(0.0, total, 1e-10);
 	}
+
 	@Test
 	public void testComputeTotalSSE_singleSegment_all() {
 		double[] column = {1.0, 2.0, 3.0};
@@ -223,6 +229,7 @@ public void testComputeTotalSSE_singleSegment_all() {
 		// the result must exactly equal the SSE of the whole segment
 		assertEquals(expected, total, 1e-10);
 	}
+
 	@Test
 	public void testComputeTotalSSE_twoSegments() {
 		// example: [0,0,0] and [1,1,1] (each constant)
@@ -237,6 +244,7 @@ public void testComputeTotalSSE_twoSegments() {
 		assertEquals(0.0, total, 1e-10);
 		assertEquals(sse1 + sse2, total, 1e-10);
 	}
+
 	@Test
 	public void testComputeTotalSSE_threeSegments() {
 		// one segment with three identical values, two segments with two values each
@@ -256,103 +264,101 @@ public void testComputeTotalSSE_threeSegments() {
 		assertEquals(0.0, total, 1e-10);
 		assertEquals(sse1 + sse2 + sse3, total, 1e-10);
 	}
+
 	@Test
 	public void testComputeTotalSSE_gapStartEnd() {
 		double[] column = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0};
-		List<Integer> breaks = Arrays.asList(2, 5, 8); // segments [2,5), [5,8)
+		List<Integer> breaks = Arrays.asList(2, 5, 8);
 
 		double total = computeTotalSSE(column, breaks);
 		double sse1 = computeSegmentCost(column, 2, 5);
 		double sse2 = computeSegmentCost(column, 5, 8);
 
-		// result: sum of the two segments
 		assertEquals(sse1 + sse2, total, 1e-10);
 
-		// indices <2 and >=8 belong to no segment and do not contribute to totalSSE
 	}
+
 	@Test
 	public void testComputeTotalSSE_oneSegment_identical() {
 		double[] column = {1.0, 2.0, 3.0, 4.0, 5.0};
-
-		// compare: SSE of the whole segment over [0,5)
 		double sseTotal = computeSegmentCost(column, 0, 5);
 
-		// computed via computeTotalSSE with breaks [0,5]
 		List<Integer> breaks = Arrays.asList(0, 5);
 		double total = computeTotalSSE(column, breaks);
 
-		// both must match exactly
 		assertEquals(sseTotal, total, 1e-10);
 	}
+
 	@Test
 	public void testComputeTotalSSE_nonConstant() {
 		double[] column = {0.0, 1.0, 2.0, 3.0, 4.0};
-		List<Integer> breaks = Arrays.asList(0, 2, 5); // [0,2), [2,5)
+		List<Integer> breaks = Arrays.asList(0, 2, 5);
 
 		double total = computeTotalSSE(column, breaks);
 		double sse1 = computeSegmentCost(column, 0, 2);
 		double sse2 = computeSegmentCost(column, 2, 5);
 
-		// sanity check: result non-negative, sum of the two SSEs
 		assertTrue(total >= 0.0);
 		assertEquals(sse1 + sse2, total, 1e-10);
 	}
+
 	@Test
 	public void testComputeTotalSSE_edgeCases() {
-		// empty column, breaks [0,0] -> no segment
-		double[] columnEmpty = {}; // length 0
+		double[] columnEmpty = {};
 		List<Integer> breaksEmpty = Arrays.asList(0, 0);
 		assertEquals(0.0, computeTotalSSE(columnEmpty, breaksEmpty), 1e-10);
 
-		// column of length 1, one segment [0,1)
 		double[] columnOne = {42.0};
 		List<Integer> breaksOne = Arrays.asList(0, 1);
 		double total = computeTotalSSE(columnOne, breaksOne);
 		assertEquals(0.0, total, 1e-10);
 	}
+
 	@Test
 	public void testRegressSegment_empty() {
 		double[] column = {1.0, 2.0, 3.0};
-		double[] result = regressSegment(column, 0, 0); // empty
+		double[] result = regressSegment(column, 0, 0);
 		assertEquals(0.0, result[0], 1e-10);
 		assertEquals(0.0, result[1], 1e-10);
 	}
+
 	@Test
 	public void testRegressSegment_singlePoint() {
 		double[] column = {1.0, 2.0, 3.0};
-		double[] result = regressSegment(column, 1, 2); // only i=1: y=2.0
+		double[] result = regressSegment(column, 1, 2);
 
-		assertEquals(0.0, result[0], 1e-10); // slope = 0
-		assertEquals(2.0, result[1], 1e-10); // intercept = mean
+		assertEquals(0.0, result[0], 1e-10);
+		assertEquals(2.0, result[1], 1e-10);
 	}
+
 	@Test
 	public void testRegressSegment_twoIdentical() {
 		double[] column = {5.0, 5.0, 1.0, 1.0};
-		double[] result = regressSegment(column, 0, 2); // i=0:5, i=1:5
+		double[] result = regressSegment(column, 0, 2);
 
-		// slope = 0, y = 5.0 + 0*i
 		assertEquals(0.0, result[0], 1e-10);
 		assertEquals(5.0, result[1], 1e-10);
 	}
+
 	@Test
 	public void testRegressSegment_twoPoints() {
-		double[] column = {0.0, 2.0}; // (i=0, y=0), (i=1, y=2)
+		double[] column = {0.0, 2.0};
 		double[] result = regressSegment(column, 0, 2);
 
-		// line through (0,0) and (1,2) -> y = 2*i + 0
 		assertEquals(2.0, result[0], 1e-10);
 		assertEquals(0.0, result[1], 1e-10);
 	}
+
 	@Test
 	public void testRegressSegment_twoPoints_offset() {
-		// column[0], column[1], column[2], column[3] -> there are 4 values
-		double[] column = {1.0, 3.0, 5.0, 7.0}; // e.g. y = 2*x + 1 -> at x=2: y=5, x=3: y=7
-		double[] result = regressSegment(column, 2, 4); // segment [2,4) -> i=2,3
-		// line through (2,5), (3,7): slope = 2, intercept = 1
+		double[] column = {1.0, 3.0, 5.0, 7.0};
+		double[] result = regressSegment(column, 2, 4);
+
 		assertEquals(2.0, result[0], 1e-10);
 		assertEquals(1.0, result[1], 1e-10);
 	}
+
 	@Test
 	public void testRegressSegment_constant() {
 		double[] column = {3.0, 3.0, 3.0, 3.0};
@@ -361,6 +367,7 @@ public void testRegressSegment_constant() {
 		assertEquals(0.0, result[0], 1e-10);
 		assertEquals(3.0, result[1], 1e-10);
 	}
+
 	@Test
 	public void testRegressSegment_linear() {
 		double[] column = new double[4];
@@ -371,13 +378,12 @@ public void testRegressSegment_linear() {
 
 		double[] result = regressSegment(column, 0, 4);
 
-		// exact: slope = 1.5, intercept = 2.0
 		assertEquals(a, result[0], 1e-10);
 		assertEquals(b, result[1], 1e-10);
 	}
+
 	@Test
 	public void testRegressSegment_denomZero() {
-		// degenerate: a segment with a single point
 		double[] column = {10.0};
 		double[] result = regressSegment(column, 0, 1);
 
@@ -388,38 +394,283 @@ public void testRegressSegment_denomZero() {
 	@Test
 	public void testCompressPiecewiseLinearFunctional_const() {
 		// 1. create a MatrixBlock with a constant column
-		double[] data = {1.0, 1.0, 1.0, 1.0, 1.0}; // 5 rows, 1 column
-		MatrixBlock in = new MatrixBlock(5, 1, false).quickSetMatrix(data, 5);
-
+		int nrows = 20, ncols = 1;
+		MatrixBlock in = new MatrixBlock(nrows, ncols, false);
+		for (int r = 0; r < nrows; r++)
+			in.set(r, 0, 1.0);
 		// 2. colIndexes for column 0
-		IColIndex colIndexes = ColIndexFactory.create(0);
-
-		// 3. invoke the compression function
-		AColGroup result = ColGroupFactory.compressPiecewiseLinearFunctional(colIndexes, in, new CompressionSettings());
+		IColIndex colIndexes = ColIndexFactory.create(new int[]{0});
+		// 3. CompressionSettings with a target loss
+		CompressionSettings cs = new CompressionSettingsBuilder().create();
+		cs.setPiecewiseTargetLoss(1e-6);
+		// 4. invoke the compression function
+		AColGroup result = ColGroupFactory.compressPiecewiseLinearFunctional(colIndexes, in, cs);
 
-		// 4. result is a ColGroupPiecewiseLinearCompressed?
+		// 5. result is a ColGroupPiecewiseLinearCompressed?
 		assertTrue(result instanceof ColGroupPiecewiseLinearCompressed);
 		ColGroupPiecewiseLinearCompressed plGroup = (ColGroupPiecewiseLinearCompressed) result;
 
-		// 5. check breakpoints: [0, 5] -> one segment
-		int[] breakpoints = plGroup.c();
-		assertArrayEquals(new int[] {0, 5}, breakpoints);
+		// 6. breakpoints via getter, not via create()
+		int[] breakpoints = plGroup.getBreakpoints();
+		assertArrayEquals(new int[]{0, 20}, breakpoints);
 
-		// 6. one segment -> one slope, one intercept
+		// 7. one segment -> one slope, one intercept
 		double[] slopes = plGroup.getSlopes();
 		double[] intercepts = plGroup.getIntercepts();
 		assertEquals(1, slopes.length);
 		assertEquals(1, intercepts.length);
 
-		// 7. constant data: slope ~0, intercept ~1.0
+		// 8. constant data: slope ~0, intercept ~1.0
 		assertEquals(0.0, slopes[0], 1e-10);
-		assertEquals(1.0, intercepts[0], 1e-10); // column mean
+		assertEquals(1.0, intercepts[0], 1e-10);
+
+		// 9. check: colIndexes is correct
+		IColIndex idx = plGroup.getColIndices();
+		assertEquals(1, idx.size());
+		assertEquals(0, idx.get(0));
+	}
+
+	@Test(expected = IllegalArgumentException.class)
+	public void testCreate_nullBreakpoints() {
+		int[] nullBp = null;
+		ColGroupPiecewiseLinearCompressed.create(
+			ColIndexFactory.create(new int[]{0}), nullBp, new double[]{1.0}, new double[]{0.0}, 10);
+	}
+
+	@Test(expected = IllegalArgumentException.class)
+	public void testCreate_tooFewBreakpoints() {
+		int[] singleBp = {0};
+		ColGroupPiecewiseLinearCompressed.create(
+			ColIndexFactory.create(new int[]{0}), singleBp, new double[]{1.0}, new double[]{0.0}, 10);
+	}
+
+	@Test(expected = IllegalArgumentException.class)
+	public void testCreate_inconsistentSlopes() {
+		int[] bp = {0, 5, 10};
+		ColGroupPiecewiseLinearCompressed.create(
+			ColIndexFactory.create(new int[]{0}), bp, new double[]{1.0, 2.0, 3.0},
+			new double[]{0.0, 1.0}, 10);
+	}
+
+	@Test(expected = IllegalArgumentException.class)
+	public void testCreate_inconsistentIntercepts() {
+		int[] bp = {0, 5, 10};
+		ColGroupPiecewiseLinearCompressed.create(
+			ColIndexFactory.create(new int[]{0}), bp, new double[]{1.0, 2.0},
+			new double[]{0.0}, 10);
+	}
+
+	@Test
+	public void testCreate_validMultiSegment() {
+		int[] bp = {0, 3, 7, 10};
+		double[] slopes = {1.0, -2.0, 0.5};
+		double[] intercepts = {0.0, 5.0, -1.0};
+		IColIndex cols = ColIndexFactory.create(new int[]{0, 1});
+
+		AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, 10);
+
+		assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed);
+		assertNotSame(bp, ((ColGroupPiecewiseLinearCompressed) cg).getBreakpoints());
+	}
+
+	@Test
+	public void testCreate_multiColumn() {
+		IColIndex cols = ColIndexFactory.create(new int[]{5, 10, 15});
+		int[] bp = {0, 5};
+		double[] slopes = {3.0};
+		double[] intercepts = {2.0};
Check: colIndexes stimmt - assertEquals(1, plGroup.getColIndex().size()); - assertEquals(0, plGroup.getColIndex().get(0)); + AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, 100); + assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); + assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); + + // + assertTrue(cg.getNumValues() > 0); + + for (int r = 0; r < 5; r++) { + double expected = 3.0 * r + 2.0; + // colIdx=0 → globale Spalte 5 + assertEquals(expected, cg.getIdx(r, 0), 1e-9); + // colIdx=1 → globale Spalte 10 + assertEquals(expected, cg.getIdx(r, 1), 1e-9); + // colIdx=2 → globale Spalte 15 + assertEquals(expected, cg.getIdx(r, 2), 1e-9); + } + + for (int r = 5; r < 10; r++) { + double expected = 3.0 * r + 2.0; + assertEquals(expected, cg.getIdx(r, 0), 1e-9); // Alle Columns gleich + } + assertEquals(cols.size(), 3); } + @Test + public void testCreate_singleColumn() { + IColIndex cols = ColIndexFactory.create(new int[]{5}); + int[] bp = {0, 5}; + double[] slopes = {3.0}; + double[] intercepts = {2.0}; + int numRows = 10; + + AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, numRows); + + assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); + + assertEquals(2.0, cg.getIdx(0, 0), 1e-9); // 3*0 + 2 + assertEquals(5.0, cg.getIdx(1, 0), 1e-9); // 3*1 + 2 + } + + @Test + public void testCreate_validMinimal() { + + // 1 Segment: [0,10] → y = 2.0 * r + 1.0 + int[] bp = {0, 10}; + double[] slopes = {2.0}; + double[] intercepts = {1.0}; + IColIndex cols = ColIndexFactory.create(new int[]{0}); + int numRows = 10; + + AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, numRows); + + // Korrekte Instanz + assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); + + // getNumValues() > 0 + assertTrue(cg.getNumValues() > 0); + + // r < numRows + for (int r = 0; r < numRows; r++) { + double expected = 2.0 * r + 1.0; + assertEquals("Row " + r, expected, cg.getIdx(r, 0), 1e-9); + } + + // Letzte gültige Row + assertEquals(19.0, cg.getIdx(9, 0), 1e-9); + + //Out-of-Bounds korrekt 0.0 + assertEquals(0.0, cg.getIdx(10, 0), 1e-9); + assertEquals(0.0, cg.getIdx(9, 1), 1e-9); + } + + @Test + public void testDecompressToDenseBlock() { + int[] bp = {0, 5, 10}; + double[] slopes = {1.0, 2.0}; + double[] intercepts = {0.0, 1.0}; + int numRows = 10; + + AColGroup cg = ColGroupPiecewiseLinearCompressed.create( + ColIndexFactory.create(new int[]{0}), bp, slopes, intercepts, numRows); + + // 1. MatrixBlock mit korrekten Dimensionen + MatrixBlock target = new MatrixBlock(numRows, 1, false); + + // 2. DenseBlock ZUERST alloziieren! + target.allocateDenseBlock(); // Oder target.allocateDenseBlock(true); + + // 3. Jetzt DenseBlock verfügbar + DenseBlock db = target.getDenseBlock(); + assertNotNull(db); // Sicherstellen! + + // 4. Dekomprimieren + cg.decompressToDenseBlock(db, 0, numRows, 0, 0); + + // 5. Prüfen + for (int r = 0; r < numRows; r++) { + double expected = (r < 5) ? 
+
+	private ColGroupPiecewiseLinearCompressed createTestGroup(int numRows) {
+		int[] bp = {0, 5, numRows};
+		double[] slopes = {1.0, 3.0};
+		double[] intercepts = {0.0, 2.0};
+		return (ColGroupPiecewiseLinearCompressed) ColGroupPiecewiseLinearCompressed.create(
+			ColIndexFactory.create(new int[]{0}), bp, slopes, intercepts, numRows);
+	}
+
+	@Test
+	public void testDecompressToDenseBlock_fullRange() {
+		ColGroupPiecewiseLinearCompressed cg = createTestGroup(12);
+
+		MatrixBlock target = new MatrixBlock(12, 1, false);
+		target.allocateDenseBlock();
+		DenseBlock db = target.getDenseBlock();
+
+		cg.decompressToDenseBlock(db, 0, 12, 0, 0);
+
+		// segment 0 [0,5): y = r
+		assertEquals(0.0, db.get(0, 0), 1e-9);
+		assertEquals(4.0, db.get(4, 0), 1e-9);
+
+		// segment 1 [5,12): y = 3*r + 2
+		assertEquals(17.0, db.get(5, 0), 1e-9);
+		assertEquals(29.0, db.get(9, 0), 1e-9);
+		assertEquals(32.0, db.get(10, 0), 1e-9);
+		assertEquals(35.0, db.get(11, 0), 1e-9);
+	}
+
+	@Test
+	public void testDecompressToDenseBlock_partialRange() {
+		ColGroupPiecewiseLinearCompressed cg = createTestGroup(12);
+
+		MatrixBlock target = new MatrixBlock(12, 1, false);
+		target.allocateDenseBlock();
+		DenseBlock db = target.getDenseBlock();
+
+		// rl=6, ru=9 → decompress r=6,7,8
+		// offR=0 → writes to target rows 6,7,8
+		cg.decompressToDenseBlock(db, 6, 9, 0, 0);
+
+		assertEquals(0.0, db.get(0, 0), 1e-9); // untouched (before rl=6)
+		assertEquals(20.0, db.get(6, 0), 1e-9);
+		assertEquals(23.0, db.get(7, 0), 1e-9);
+		assertEquals(26.0, db.get(8, 0), 1e-9);
+		assertEquals(0.0, db.get(9, 0), 1e-9); // untouched (after ru=9)
+	}
+
+	@Test
+	public void testDecompressToDenseBlock_emptyRange() {
+		ColGroupPiecewiseLinearCompressed cg = createTestGroup(12);
+
+		MatrixBlock target = new MatrixBlock(5, 1, false);
+		target.allocateDenseBlock();
+		DenseBlock db = target.getDenseBlock();
+
+		// empty ranges
+		cg.decompressToDenseBlock(db, 12, 12, 0, 0); // rl=ru
+		cg.decompressToDenseBlock(db, 3, 2, 0, 0); // rl>ru
+
+		// everything stays 0.0
+		for (int r = 0; r < 5; r++) {
+			assertEquals(0.0, db.get(r, 0), 1e-9);
+		}
+	}
+
+	@Test
+	public void testDecompressToDenseBlock_nullSafety() {
+		ColGroupPiecewiseLinearCompressed cg = createTestGroup(10);
+
+		// null DenseBlock
+		cg.decompressToDenseBlock(null, 0, 10, 0, 0);
+
+		// invalid parameters (empty range)
+		MatrixBlock target = new MatrixBlock(10, 1, false);
+		target.allocateDenseBlock();
+		DenseBlock db = target.getDenseBlock();
+
+		cg.decompressToDenseBlock(db, 12, 12, 0, 0); // rl == ru
+		cg.decompressToDenseBlock(db, 5, 2, 0, 0); // rl > ru
+
+		// target unchanged
+		for (int r = 0; r < 10; r++) {
+			assertEquals(0.0, db.get(r, 0), 1e-9);
+		}
+	}

From 78460b51a08574557089866bc80c51488950c8da Mon Sep 17 00:00:00 2001
From: mori49 <84979219+mori49@users.noreply.github.com>
Date: Wed, 28 Jan 2026 01:52:21 +0100
Subject: [PATCH 07/21] wip: decompressing

---
 .../colgroup/scheme/ColGroupPiecewiseLinearCompressed.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java
index 71e935643d9..4062c4da611 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java
@@ -65,7 +65,7 @@ public static AColGroup create(IColIndex colIndexes, int[] breakpoints, double[]
 	@Override
 	public void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC) {
-		// ✅ complete null safety
+
		if (db == null || _colIndexes == null || _colIndexes.size() == 0 ||
			breakpoints == null || slopes == null || intercepts == null) {
			return;
@@ -95,7 +95,7 @@ public void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int
			int gr = offR + r;
			int gc = offC + col;
-			// ✅ bounds check before set()
+
			if (gr >= 0 && gr < db.numRows() && gc >= 0 && gc < db.numCols()) {
				db.set(gr, gc, yhat);
			}
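
Read together with the surrounding context lines, the decompress path guarded by this patch amounts to the loop sketched below. This is an illustrative reading, not the verbatim method; findSegment is a hypothetical stand-in for the breakpoint lookup:

	// Sketch: for each row r in [rl, ru), find its segment, evaluate yhat,
	// then write only if the offset target coordinates are inside the DenseBlock.
	for(int r = rl; r < ru; r++) {
		int s = findSegment(r); // hypothetical helper: bp[s] <= r < bp[s+1]
		double yhat = slopes[s] * r + intercepts[s];
		for(int col = 0; col < _colIndexes.size(); col++) {
			int gr = offR + r;
			int gc = offC + col;
			if(gr >= 0 && gr < db.numRows() && gc >= 0 && gc < db.numCols())
				db.set(gr, gc, yhat);
		}
	}
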
From f42b766d693e95ecff41faaff16f1192deedd738 Mon Sep 17 00:00:00 2001
From: mori49 <84979219+mori49@users.noreply.github.com>
Date: Wed, 28 Jan 2026 13:14:57 +0100
Subject: [PATCH 08/21] add: enum CompressionType value PiecewiseLinear

---
 .../org/apache/sysds/runtime/compress/colgroup/AColGroup.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
index 003703f86a4..e2bf69f5c15 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
@@ -65,7 +65,7 @@ public abstract class AColGroup implements Serializable {

	/** Public super types of compression ColGroups supported */
	public static enum CompressionType {
-		UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional;
+		UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional, PiecewiseLinear;

		public boolean isDense() {
			return this == DDC || this == CONST || this == DDCFOR || this == DDCFOR;

From 47256c0a14ce2d2b8a0fb0c068bdc82ea039b4a5 Mon Sep 17 00:00:00 2001
From: mori49 <84979219+mori49@users.noreply.github.com>
Date: Wed, 28 Jan 2026 13:16:10 +0100
Subject: [PATCH 09/21] add: include functionality of piecewise linear
 compression

---
 .../sysds/runtime/compress/colgroup/ColGroupFactory.java | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
index 06bf74b423c..c5de46b161c 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
@@ -304,6 +304,11 @@ else if(ct == CompressionType.LinearFunctional) {
				return compressLinearFunctional(colIndexes, in, cs);
			}
		}
+		else if(ct == CompressionType.PiecewiseLinear) {
+
+			return compressPiecewiseLinearFunctional(colIndexes, in, cs);
+
+		}
		else if(ct == CompressionType.DDCFOR) {
			AColGroup g = directCompressDDC(colIndexes, cg);
			if(g instanceof ColGroupDDC)
@@ -1080,7 +1085,6 @@ public static AColGroup compressPiecewiseLinearFunctional(IColIndex colIndexes,

		// Determine breakpoints: partition into segments

-		double targetLoss = cs.getPiecewiseTargetLoss();
		List<Integer> breakpointsList = computeBreakpoints(cs, column);
		int[] breakpoints = breakpointsList.stream().mapToInt(Integer::intValue).toArray();
		//For each segment, linear regression as the compression scheme
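
With the enum value (PATCH 08) and this dispatch branch in place, the scheme is reachable through the regular compression entry point. A minimal sketch using only calls that appear elsewhere in this series; whether a group actually takes this branch depends on the per-group compression type recorded in the CompressedSizeInfo:

	// Sketch: request piecewise-linear compression via the settings builder,
	// then let ColGroupFactory.compressColGroups route qualifying column groups
	// to compressPiecewiseLinearFunctional.
	CompressionSettings cs = new CompressionSettingsBuilder()
		.addValidCompression(AColGroup.CompressionType.PiecewiseLinear)
		.create();
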
From 505c0ccfeaf011650363382a45d69f889fd4fa9b Mon Sep 17 00:00:00 2001
From: mori49 <84979219+mori49@users.noreply.github.com>
Date: Wed, 28 Jan 2026 13:47:16 +0100
Subject: [PATCH 10/21] add: Comment

---
 .../org/apache/sysds/runtime/compress/colgroup/AColGroup.java | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
index e2bf69f5c15..32a4053c95b 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
@@ -64,7 +64,8 @@ public abstract class AColGroup implements Serializable {
	private static final long serialVersionUID = -1318908671481L;

	/** Public super types of compression ColGroups supported */
-	public static enum CompressionType {
+	// Enum value added -> but do I also need to add it to the ColGroupType enum?
+	public static enum CompressionType {
		UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional, PiecewiseLinear;

		public boolean isDense() {

From 103abd87c1ef55eedc4f6980da7285539ec87de1 Mon Sep 17 00:00:00 2001
From: mori49 <84979219+mori49@users.noreply.github.com>
Date: Wed, 28 Jan 2026 13:49:15 +0100
Subject: [PATCH 11/21] add: dispatch test and remove unused imports

---
 ...ColGroupPiecewiseLinearCompressedTest.java | 41 ++++++++++++++++---
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java
index c0ca62ce9d5..5b3688be5b1 100644
--- a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java
+++ b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java
@@ -2,20 +2,17 @@
 import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.CompressionSettingsBuilder;
-import org.apache.sysds.runtime.compress.colgroup.indexes.ArrayIndex;
 import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
 import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
 import org.apache.sysds.runtime.compress.colgroup.scheme.ColGroupPiecewiseLinearCompressed;
-import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory;
-
+import org.apache.sysds.runtime.compress.estim.CompressedSizeInfo;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+import org.apache.sysds.runtime.compress.estim.EstimationFactors;
 import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.junit.Test;
-import org.junit.jupiter.api.BeforeEach;
-
 import java.util.Arrays;
 import java.util.List;
-
 import static org.apache.sysds.runtime.compress.colgroup.ColGroupFactory.*;
 import static org.junit.Assert.*;
@@ -671,7 +668,39 @@ public void testDecompressToDenseBlock_nullSafety() {
			assertEquals(0.0, db.get(r, 0), 1e-9);
		}
	}
+	private CompressedSizeInfo createTestCompressedSizeInfo() {
+		IColIndex cols = ColIndexFactory.create(new int[]{0});
+		EstimationFactors facts = new EstimationFactors(2, 10);
+
+		CompressedSizeInfoColGroup info = new CompressedSizeInfoColGroup(
+			cols, facts, AColGroup.CompressionType.PiecewiseLinear);
+
+		List<CompressedSizeInfoColGroup> infos = Arrays.asList(info);
+		CompressedSizeInfo csi = new CompressedSizeInfo(infos);
+
+		return csi;
+	}
+	@Test
+	public void testCompressPiecewiseLinear_viaRealAPI() {
+
+		MatrixBlock in = new MatrixBlock(10, 1, false);
+		in.allocateDenseBlock();
+		for (int r = 0; r < 10; r++) {
+			in.set(r, 0, r * 0.5);
+		}
+		CompressionSettings cs = new CompressionSettingsBuilder()
+			.addValidCompression(AColGroup.CompressionType.PiecewiseLinear)
+			.create();
+
+		CompressedSizeInfo csi = createTestCompressedSizeInfo();
+
+		List<AColGroup> colGroups = ColGroupFactory.compressColGroups(in, csi, cs);
+
+		boolean hasPiecewise = colGroups.stream()
+			.anyMatch(cg -> cg instanceof ColGroupPiecewiseLinearCompressed);
+		assertTrue(hasPiecewise);
+	}
 }
\ No newline at end of file

From 31b957de3ea941e437b51527d8afaa7830e35a8c Mon Sep 17 00:00:00 2001
From: mori49 <84979219+mori49@users.noreply.github.com>
Date: Thu, 29 Jan 2026 19:50:40 +0100
Subject: [PATCH 12/21] fix: reformat code with Eclipse XML profile

---
 .../runtime/compress/CompressionSettings.java |  333 ++--
 .../runtime/compress/colgroup/AColGroup.java  |  235 +--
 .../compress/colgroup/ColGroupFactory.java    |  374 +++--
 .../ColGroupPiecewiseLinearCompressed.java    |  623 ++++----
 ...ColGroupPiecewiseLinearCompressedTest.java | 1335 ++++++++---------
 5 files changed, 1440 insertions(+), 1460 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
index d1f97928975..7d5a1dac51a 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
@@ -34,174 +34,171 @@
 * CompressionSettingsBuilder for default non static parameters.
 */
 public class CompressionSettings {
-	private static final Log LOG = LogFactory.getLog(CompressionSettings.class.getName());
+	private static final Log LOG = LogFactory.getLog(CompressionSettings.class.getName());

-	/** Parallelization threshold for DDC compression */
-	public static int PAR_DDC_THRESHOLD = 10000;
+	/** Parallelization threshold for DDC compression */
+	public static int PAR_DDC_THRESHOLD = 10000;
+
+	/**
+	 * Size of the blocks used in a blocked bitmap representation. Note it is exactly Character.MAX_VALUE. This is not
+	 * Character max value + 1 because it breaks the offsets in cases with fully dense values.
+	 */
+	public static final int BITMAP_BLOCK_SZ = Character.MAX_VALUE;

-	/**
-	 * Size of the blocks used in a blocked bitmap representation. Note it is exactly Character.MAX_VALUE. This is not
-	 * Character max value + 1 because it breaks the offsets in cases with fully dense values.
-	 */
-	public static final int BITMAP_BLOCK_SZ = Character.MAX_VALUE;
+	/**
+	 * Sorting of values by physical length helps by 10-20%, especially for serial, while slight performance decrease
+	 * for parallel incl multi-threaded, hence not applied for distributed operations (also because compression time +
+	 * garbage collection increases)
+	 */
+	public final boolean sortTuplesByFrequency;

-	/**
-	 * Sorting of values by physical length helps by 10-20%, especially for serial, while slight performance decrease
-	 * for parallel incl multi-threaded, hence not applied for distributed operations (also because compression time +
-	 * garbage collection increases)
-	 */
-	public final boolean sortTuplesByFrequency;
-
-	/**
-	 * The sampling ratio used when choosing ColGroups. Note that, default behavior is to use exact estimator if the
-	 * number of elements is below 1000.
-	 *
-	 * DEPRECATED
-	 */
-	public final double samplingRatio;
-
-	/**
-	 * The sampling ratio power to use when choosing sample size. This is used in accordance to the function:
-	 *
-	 * sampleSize += nRows^samplePower;
-	 *
-	 * The value is bounded to be in the range of 0 to 1, 1 giving a sample size of everything, and 0 adding 1.
-	 */
-	public final double samplePower;
-
-	/** Share DDC Dictionaries between ColGroups. */
-	public final boolean allowSharedDictionary;
-
-	/** Boolean specifying which transpose setting is used, can be auto, true or false */
-	public final String transposeInput;
-
-	/** If the seed is -1 then the system used system millisecond time and class hash for seeding. */
-	public final int seed;
-
-	/** True if lossy compression is enabled */
-	public final boolean lossy;
-
-	/** The selected method for column partitioning used in CoCoding compressed columns */
-	public final PartitionerType columnPartitioner;
-
-	/** The cost computation type for the compression */
-	public final CostType costComputationType;
-
-	/** The maximum number of columns CoCoded allowed */
-	public final int maxColGroupCoCode;
-
-	/**
-	 * A Cocode parameter that differs in behavior based on compression method, in general it is a value that reflects
-	 * aggressively likely coCoding is used.
-	 */
-	public final double coCodePercentage;
-
-	/**
-	 * Valid Compressions List, containing the ColGroup CompressionTypes that are allowed to be used for the compression
-	 * Default is to always allow for Uncompressed ColGroup.
-	 */
-	public final EnumSet<CompressionType> validCompressions;
-
-	/** The minimum size of the sample extracted. */
-	public final int minimumSampleSize;
-
-	/** The maximum size of the sample extracted. */
-	public final int maxSampleSize;
-
-	/** The sample type used for sampling */
-	public final EstimationType estimationType;
-
-	/**
-	 * Transpose input matrix, to optimize access when extracting bitmaps. This setting is changed inside the script
-	 * based on the transposeInput setting.
-	 *
-	 * This is intentionally left as a mutable value, since the transposition of the input matrix is decided in phase 3.
-	 */
-	public boolean transposed = false;
-
-	/** The minimum compression ratio to achieve. */
-	public final double minimumCompressionRatio;
-
-
-
-	/** Is a spark instruction */
-	public final boolean isInSparkInstruction;
-
-	/** The sorting type used in sorting/joining offsets to create SDC groups */
-	public final SORT_TYPE sdcSortType;
-
-	/** if the settings have been logged already. */
-	public static boolean printedStatus = false;
-
-	public final double[] scaleFactors;
-
-	public final boolean preferDeltaEncoding;
-
-	/**
-	 * Target total loss for piecewise linear compression.
-	 * Interpretation: maximum allowed global MSE per value in the column.
-	 * 0.0 ~ effectively lossless, many segments
-	 * >0  ~ more approximation allowed, fewer segments
-	 */
-	private double piecewiseTargetLoss = Double.NaN;
-	public void setPiecewiseTargetLoss(double piecewiseTargetLoss) {
-		this.piecewiseTargetLoss = piecewiseTargetLoss;
-
-	}
-	public double getPiecewiseTargetLoss() {
-		return piecewiseTargetLoss;
-	}
-
-
-	protected CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary,
-		String transposeInput, int seed, boolean lossy, EnumSet<CompressionType> validCompressions,
-		boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage,
-		int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType,
-		double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors,
-		boolean preferDeltaEncoding) {
-		this.samplingRatio = samplingRatio;
-		this.samplePower = samplePower;
-		this.allowSharedDictionary = allowSharedDictionary;
-		this.transposeInput = transposeInput;
-		this.seed = seed == -1 ? (int) System.nanoTime() : seed;
-		this.validCompressions = validCompressions;
-		this.lossy = lossy;
-		this.sortTuplesByFrequency = sortValuesByLength;
-		this.columnPartitioner = columnPartitioner;
-		this.maxColGroupCoCode = maxColGroupCoCode;
-		this.coCodePercentage = coCodePercentage;
-		this.minimumSampleSize = minimumSampleSize;
-		this.maxSampleSize = maxSampleSize;
-		this.estimationType = estimationType;
-		this.costComputationType = costComputationType;
-		this.minimumCompressionRatio = minimumCompressionRatio;
-		this.isInSparkInstruction = isInSparkInstruction;
-		this.sdcSortType = sdcSortType;
-		this.scaleFactors = scaleFactors;
-		this.preferDeltaEncoding = preferDeltaEncoding;
-
-		if(!printedStatus && LOG.isDebugEnabled()) {
-			printedStatus = true;
-			LOG.debug(this.toString());
-		}
-	}
-
-	public boolean isRLEAllowed() {
-		return this.validCompressions.contains(CompressionType.RLE);
-	}
-
-	@Override
-	public String toString() {
-		StringBuilder sb = new StringBuilder();
-		sb.append("CompressionSettings: ");
-		sb.append("\t Valid Compressions: " + validCompressions);
-		sb.append("\t Share dict: " + allowSharedDictionary);
-		sb.append("\t Partitioner: " + columnPartitioner);
-		sb.append("\t Lossy: " + lossy);
-		sb.append("\t Cost Computation Type: " + costComputationType);
-		if(samplingRatio < 1.0)
-			sb.append("\t Estimation Type: " + estimationType);
-		return sb.toString();
-	}
-}
\ No newline at end of file
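
The piecewiseTargetLoss field carried over in the re-indented block below is the knob read by ColGroupFactory.computeBreakpoints: it is interpreted as a per-value MSE bound and converted into a global SSE budget. A short sketch of that arithmetic, assuming a column of n values:

	// Sketch: how the target loss translates into the error budget used in
	// ColGroupFactory.computeBreakpoints.
	CompressionSettings cs = new CompressionSettingsBuilder().create();
	cs.setPiecewiseTargetLoss(1e-4);  // allow an average squared error of 1e-4 per value
	int n = 10000;                    // rows in the column
	double sseBudget = n * cs.getPiecewiseTargetLoss(); // total SSE the segmentation may spend
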
+	/**
+	 * The sampling ratio used when choosing ColGroups. Note that, default behavior is to use exact estimator if the
+	 * number of elements is below 1000.
+	 *
+	 * DEPRECATED
+	 */
+	public final double samplingRatio;
+
+	/**
+	 * The sampling ratio power to use when choosing sample size. This is used in accordance to the function:
+	 *
+	 * sampleSize += nRows^samplePower;
+	 *
+	 * The value is bounded to be in the range of 0 to 1, 1 giving a sample size of everything, and 0 adding 1.
+	 */
+	public final double samplePower;
+
+	/** Share DDC Dictionaries between ColGroups. */
+	public final boolean allowSharedDictionary;
+
+	/** Boolean specifying which transpose setting is used, can be auto, true or false */
+	public final String transposeInput;
+
+	/** If the seed is -1 then the system used system millisecond time and class hash for seeding. */
+	public final int seed;
+
+	/** True if lossy compression is enabled */
+	public final boolean lossy;
+
+	/** The selected method for column partitioning used in CoCoding compressed columns */
+	public final PartitionerType columnPartitioner;
+
+	/** The cost computation type for the compression */
+	public final CostType costComputationType;
+
+	/** The maximum number of columns CoCoded allowed */
+	public final int maxColGroupCoCode;
+
+	/**
+	 * A Cocode parameter that differs in behavior based on compression method, in general it is a value that reflects
+	 * aggressively likely coCoding is used.
+	 */
+	public final double coCodePercentage;
+
+	/**
+	 * Valid Compressions List, containing the ColGroup CompressionTypes that are allowed to be used for the compression
+	 * Default is to always allow for Uncompressed ColGroup.
+	 */
+	public final EnumSet<CompressionType> validCompressions;
+
+	/** The minimum size of the sample extracted. */
+	public final int minimumSampleSize;
+
+	/** The maximum size of the sample extracted. */
+	public final int maxSampleSize;
+
+	/** The sample type used for sampling */
+	public final EstimationType estimationType;
+
+	/**
+	 * Transpose input matrix, to optimize access when extracting bitmaps. This setting is changed inside the script
+	 * based on the transposeInput setting.
+	 *
+	 * This is intentionally left as a mutable value, since the transposition of the input matrix is decided in phase
+	 * 3.
+	 */
+	public boolean transposed = false;
+
+	/** The minimum compression ratio to achieve. */
+	public final double minimumCompressionRatio;
+
+	/** Is a spark instruction */
+	public final boolean isInSparkInstruction;
+
+	/** The sorting type used in sorting/joining offsets to create SDC groups */
+	public final SORT_TYPE sdcSortType;
+
+	/** if the settings have been logged already. */
+	public static boolean printedStatus = false;
+
+	public final double[] scaleFactors;
+
+	public final boolean preferDeltaEncoding;
+
+	/**
+	 * Target total loss for piecewise linear compression. Interpretation: maximum allowed global MSE per value in
+	 * the column. 0.0 ~ effectively lossless, many segments; >0 ~ more approximation allowed, fewer segments
+	 */
+	private double piecewiseTargetLoss = Double.NaN;
+
+	public void setPiecewiseTargetLoss(double piecewiseTargetLoss) {
+		this.piecewiseTargetLoss = piecewiseTargetLoss;
+
+	}
+
+	public double getPiecewiseTargetLoss() {
+		return piecewiseTargetLoss;
+	}
+
+	protected CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary,
+		String transposeInput, int seed, boolean lossy, EnumSet<CompressionType> validCompressions,
+		boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage,
+		int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType,
+		double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors,
+		boolean preferDeltaEncoding) {
+		this.samplingRatio = samplingRatio;
+		this.samplePower = samplePower;
+		this.allowSharedDictionary = allowSharedDictionary;
+		this.transposeInput = transposeInput;
+		this.seed = seed == -1 ? (int) System.nanoTime() : seed;
+		this.validCompressions = validCompressions;
+		this.lossy = lossy;
+		this.sortTuplesByFrequency = sortValuesByLength;
+		this.columnPartitioner = columnPartitioner;
+		this.maxColGroupCoCode = maxColGroupCoCode;
+		this.coCodePercentage = coCodePercentage;
+		this.minimumSampleSize = minimumSampleSize;
+		this.maxSampleSize = maxSampleSize;
+		this.estimationType = estimationType;
+		this.costComputationType = costComputationType;
+		this.minimumCompressionRatio = minimumCompressionRatio;
+		this.isInSparkInstruction = isInSparkInstruction;
+		this.sdcSortType = sdcSortType;
+		this.scaleFactors = scaleFactors;
+		this.preferDeltaEncoding = preferDeltaEncoding;
+
+		if(!printedStatus && LOG.isDebugEnabled()) {
+			printedStatus = true;
+			LOG.debug(this.toString());
+		}
+	}
+
+	public boolean isRLEAllowed() {
+		return this.validCompressions.contains(CompressionType.RLE);
+	}
+
+	@Override
+	public String toString() {
+		StringBuilder sb = new StringBuilder();
+		sb.append("CompressionSettings: ");
+		sb.append("\t Valid Compressions: " + validCompressions);
+		sb.append("\t Share dict: " + allowSharedDictionary);
+		sb.append("\t Partitioner: " + columnPartitioner);
+		sb.append("\t Lossy: " + lossy);
+		sb.append("\t Cost Computation Type: " + costComputationType);
+		if(samplingRatio < 1.0)
+			sb.append("\t Estimation Type: " + estimationType);
+		return sb.toString();
+	}
+}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
index 32a4053c95b..d761af7667a 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
@@ -55,7 +55,7 @@
 /**
 * Abstract Class that is the lowest class type for the Compression framework.
- *
+ *
 * AColGroup store information about a number of columns.
 *
 */
@@ -64,8 +64,8 @@ public abstract class AColGroup implements Serializable {
	private static final long serialVersionUID = -1318908671481L;

	/** Public super types of compression ColGroups supported */
-	// Enum value added -> but do I also need to add it to the ColGroupType enum?
-	public static enum CompressionType {
+	// Enum value added -> but do I also need to add it to the ColGroupType enum?
+	public static enum CompressionType {
		UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional, PiecewiseLinear;

		public boolean isDense() {
@@ -83,7 +83,7 @@ public boolean isSDC() {

	/**
	 * Concrete ColGroupType
-	 *
+	 *
	 * Protected such that outside the ColGroup package it should be unknown which specific subtype is used.
	 */
	protected static enum ColGroupType {
@@ -96,7 +96,7 @@ protected static enum ColGroupType {

	/**
	 * Main constructor.
-	 *
+	 *
	 * @param colIndices offsets of the columns in the matrix block that make up the group
	 */
	protected AColGroup(IColIndex colIndices) {
@@ -105,7 +105,7 @@ protected AColGroup(IColIndex colIndices) {

	/**
	 * Obtain the offsets of the columns in the matrix block that make up the group
-	 *
+	 *
	 * @return offsets of the columns in the matrix block that make up the group
	 */
	public final IColIndex getColIndices() {
@@ -114,7 +114,7 @@ public final IColIndex getColIndices() {

	/**
	 * Obtain the number of columns in this column group.
-	 *
+	 *
	 * @return number of columns in this column group
	 */
	public final int getNumCols() {
@@ -125,9 +125,9 @@
	 * Shift all column indexes contained by an offset.
* * This is used for rbind to combine compressed matrices. - * + * * Since column indexes are reused between operations, we allocate a new list here to be safe - * + * * @param offset The offset to move all columns * @return A new column group object with the shifted columns */ @@ -139,7 +139,7 @@ public final AColGroup shiftColIndices(int offset) { * Copy the content of the column group with pointers to the previous content but with new column given Note this * method does not verify if the colIndexes specified are valid and correct dimensions for the underlying column * groups. - * + * * @param colIndexes the new indexes to use in the copy * @return a new object with pointers to underlying data. */ @@ -147,7 +147,7 @@ public final AColGroup shiftColIndices(int offset) { /** * Get the upper bound estimate of in memory allocation for the column group. - * + * * @return an upper bound on the number of bytes used to store this ColGroup in memory. */ public long estimateInMemorySize() { @@ -158,9 +158,9 @@ public long estimateInMemorySize() { /** * Decompress a range of rows into a sparse block - * + * * Note that this is using append, so the sparse column indexes need to be sorted afterwards. - * + * * @param sb Sparse Target block * @param rl Row to start at * @param ru Row to end at @@ -171,7 +171,7 @@ public final void decompressToSparseBlock(SparseBlock sb, int rl, int ru) { /** * Decompress a range of rows into a dense block - * + * * @param db Dense target block * @param rl Row to start at * @param ru Row to end at @@ -182,7 +182,7 @@ public final void decompressToDenseBlock(DenseBlock db, int rl, int ru) { /** * Decompress a range of rows into a dense transposed block. - * + * * @param db Dense target block * @param rl Row in this column group to start at. * @param ru Row in this column group to end at. @@ -192,7 +192,7 @@ public final void decompressToDenseBlock(DenseBlock db, int rl, int ru) { /** * Decompress the column group to the sparse transposed block. Note that the column groups would only need to * decompress into specific sub rows of the Sparse block - * + * * @param sb Sparse target block * @param nColOut The number of columns in the sb. */ @@ -200,7 +200,7 @@ public final void decompressToDenseBlock(DenseBlock db, int rl, int ru) { /** * Serializes column group to data output. - * + * * @param out data output * @throws IOException if IOException occurs */ @@ -213,7 +213,7 @@ protected void write(DataOutput out) throws IOException { /** * Returns the exact serialized size of column group. This can be used for example for buffer preallocation. - * + * * @return exact serialized size for column group */ public long getExactSizeOnDisk() { @@ -226,11 +226,11 @@ public long getExactSizeOnDisk() { /** * Slice out the columns within the range of cl and cu to remove the dictionary values related to these columns. If * the ColGroup slicing from does not contain any columns within the range null is returned. - * + * * @param cl The lower bound of the columns to select * @param cu The upper bound of the columns to select (not inclusive). * @return A cloned Column Group, with a copied pointer to the old column groups index structure, but reduced - * dictionary and _columnIndexes correctly aligned with the expected sliced compressed matrix. + * dictionary and _columnIndexes correctly aligned with the expected sliced compressed matrix. 
*/ public final AColGroup sliceColumns(int cl, int cu) { if(cl <= _colIndexes.get(0) && cu > _colIndexes.get(_colIndexes.size() - 1)) { @@ -248,10 +248,10 @@ else if(cu - cl == 1) /** * Slice out a single column from the column group. - * + * * @param col The column to slice, the column could potentially not be inside the column group * @return A new column group that is a single column, if the column requested is not in this column group null is - * returned. + * returned. */ public final AColGroup sliceColumn(int col) { int idx = _colIndexes.findIndex(col); @@ -263,11 +263,11 @@ public final AColGroup sliceColumn(int col) { /** * Slice out multiple columns within the interval between the given indexes. - * + * * @param cl The lower column index to slice from * @param cu The upper column index to slice to, (not included) * @return A column group of this containing the columns specified, returns null if the columns specified is not - * contained in the column group + * contained in the column group */ protected final AColGroup sliceMultiColumns(int cl, int cu) { SliceResult sr = _colIndexes.slice(cl, cu); @@ -279,7 +279,7 @@ protected final AColGroup sliceMultiColumns(int cl, int cu) { /** * Compute the column sum of the given list of groups - * + * * @param groups The Groups to sum * @param res The result to put the values into * @param nRows The number of rows in the groups @@ -293,9 +293,9 @@ public static double[] colSum(Collection groups, double[] res, int nR /** * Get the value at a global row/column position. - * + * * In general this performs since a binary search of colIndexes is performed for each lookup. - * + * * @param r row * @param c column * @return value at the row/column position @@ -310,7 +310,7 @@ public double get(int r, int c) { /** * Get the value at a colGroup specific row/column index position. - * + * * @param r row * @param colIdx column index in the _colIndexes. * @return value at the row/column index position @@ -319,16 +319,16 @@ public double get(int r, int c) { /** * Obtain number of distinct tuples in contained sets of values associated with this column group. - * + * * If the column group is uncompressed the number or rows is returned. - * + * * @return the number of distinct sets of values associated with the bitmaps in this column group */ public abstract int getNumValues(); /** * Obtain the compression type. - * + * * @return How the elements of the column group are compressed. */ public abstract CompressionType getCompType(); @@ -336,14 +336,14 @@ public double get(int r, int c) { /** * Internally get the specific type of ColGroup, this could be extracted from the object but that does not allow for * nice switches in the code. - * + * * @return ColGroupType of the object. */ protected abstract ColGroupType getColGroupType(); /** * Decompress into the DenseBlock. (no NNZ handling) - * + * * @param db Target DenseBlock * @param rl Row to start decompression from * @param ru Row to end decompression at (not inclusive) @@ -354,10 +354,10 @@ public double get(int r, int c) { /** * Decompress into the SparseBlock. (no NNZ handling) - * + * * Note this method is allowing to calls to append since it is assumed that the sparse column indexes are sorted * afterwards - * + * * @param sb Target SparseBlock * @param rl Row to start decompression from * @param ru Row to end decompression at (not inclusive) @@ -368,9 +368,9 @@ public double get(int r, int c) { /** * Right matrix multiplication with this column group. 
- * + * * This method can return null, meaning that the output overlapping group would have been empty. - * + * * @param right The MatrixBlock on the right of this matrix multiplication * @return The new Column Group or null that is the result of the matrix multiplication. */ @@ -380,9 +380,9 @@ public final AColGroup rightMultByMatrix(MatrixBlock right) { /** * Right matrix multiplication with this column group. - * + * * This method can return null, meaning that the output overlapping group would have been empty. - * + * * @param right The MatrixBlock on the right of this matrix multiplication * @param allCols A pre-materialized list of all col indexes, that can be shared across all column groups if use * full, can be set to null. @@ -393,7 +393,7 @@ public final AColGroup rightMultByMatrix(MatrixBlock right) { /** * Right side Matrix multiplication, iterating though this column group and adding to the ret - * + * * @param right Right side matrix to multiply with. * @param ret The return matrix to add results to * @param rl The row of this column group to multiply from @@ -402,18 +402,20 @@ public final AColGroup rightMultByMatrix(MatrixBlock right) { * @param cru The right hand side column upper * @param nRows The number of rows in this column group */ - public void rightDecompressingMult(MatrixBlock right, MatrixBlock ret, int rl, int ru, int nRows, int crl, int cru){ - throw new NotImplementedException("not supporting right Decompressing Multiply on class: " + this.getClass().getSimpleName()); + public void rightDecompressingMult(MatrixBlock right, MatrixBlock ret, int rl, int ru, int nRows, int crl, + int cru) { + throw new NotImplementedException( + "not supporting right Decompressing Multiply on class: " + this.getClass().getSimpleName()); } /** * Do a transposed self matrix multiplication on the left side t(x) %*% x. but only with this column group. - * + * * This gives better performance since there is no need to iterate through all the rows of the matrix, but the * execution can be limited to its number of distinct values. - * + * * Note it only calculate the upper triangle - * + * * @param ret The return matrix block [numColumns x numColumns] * @param nRows The number of rows in the column group */ @@ -421,7 +423,7 @@ public void rightDecompressingMult(MatrixBlock right, MatrixBlock ret, int rl, i /** * Left multiply with this column group. - * + * * @param matrix The matrix to multiply with on the left * @param result The result to output the values into, always dense for the purpose of the column groups * parallelizing @@ -435,7 +437,7 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Left side matrix multiplication with a column group that is transposed. - * + * * @param lhs The left hand side Column group to multiply with, the left hand side should be considered * transposed. Also it should be guaranteed that this column group is not empty. * @param result The result matrix to insert the result of the multiplication into @@ -445,16 +447,16 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Matrix multiply with this other column group, but: - * + * * 1. Only output upper triangle values. - * + * * 2. Multiply both ways with "this" being on the left and on the right. - * + * * It should be guaranteed that the input is not the same as the caller of the method. 
- * + * * The second step is achievable by treating the initial multiplied matrix, and adding its values to the correct * locations in the output. - * + * * @param other The other Column group to multiply with * @param result The result matrix to put the results into */ @@ -463,7 +465,7 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Perform the specified scalar operation directly on the compressed column group, without decompressing individual * cells if possible. - * + * * @param op operation to perform * @return version of this column group with the operation applied */ @@ -471,7 +473,7 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Perform a binary row operation. - * + * * @param op The operation to execute * @param v The vector of values to apply the values contained should be at least the length of the highest * value in the column index @@ -482,7 +484,7 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Short hand add operator call on column group to add a row vector to the column group - * + * * @param v The vector to add * @return A new column group where the vector is added. */ @@ -492,7 +494,7 @@ public AColGroup addVector(double[] v) { /** * Perform a binary row operation. - * + * * @param op The operation to execute * @param v The vector of values to apply the values contained should be at least the length of the highest * value in the column index @@ -504,9 +506,9 @@ public AColGroup addVector(double[] v) { /** * Unary Aggregate operator, since aggregate operators require new object output, the output becomes an uncompressed * matrix. - * + * * The range of rl to ru only applies to row aggregates. (ReduceCol) - * + * * @param op The operator used * @param c The output matrix block * @param nRows The total number of rows in the Column Group @@ -517,9 +519,9 @@ public AColGroup addVector(double[] v) { /** * Slice out column at specific index of this column group. - * + * * It is guaranteed that the column to slice is contained in this columnGroup. - * + * * @param idx The column index to slice out. * @return A new column group containing the columns inside. (never null) */ @@ -527,9 +529,9 @@ public AColGroup addVector(double[] v) { /** * Slice range of columns inside this column group. - * + * * It is guaranteed that the columns to slice is contained in this columnGroup. - * + * * @param idStart The column index to start at * @param idEnd The column index to end at (not included) * @param outputCols The output columns to extract materialized for ease of implementation @@ -539,9 +541,10 @@ public AColGroup addVector(double[] v) { /** * Slice range of rows out of the column group and return a new column group only containing the row segment. - * - * Note that this slice should maintain pointers back to the original dictionaries and only modify index structures. - * + * + * Note that this slice should maintain pointers back to the original dictionaries and only modify index + * structures. + * * @param rl The row to start at * @param ru The row to end at (not included) * @return A new column group containing the specified row range. @@ -550,21 +553,21 @@ public AColGroup addVector(double[] v) { /** * Short hand method for getting minimum value contained in this column group. - * + * * @return The minimum value contained in this ColumnGroup */ public abstract double getMin(); /** * Short hand method for getting maximum value contained in this column group. 
- * + * * @return The maximum value contained in this ColumnGroup */ public abstract double getMax(); /** * Short hand method for getting the sum of this column group - * + * * @param nRows The number of rows in the column group * @return The sum of this column group */ @@ -572,7 +575,7 @@ public AColGroup addVector(double[] v) { /** * Detect if the column group contains a specific value. - * + * * @param pattern The value to look for. * @return boolean saying true if the value is contained. */ @@ -580,7 +583,7 @@ public AColGroup addVector(double[] v) { /** * Get the number of nonZeros contained in this column group. - * + * * @param nRows The number of rows in the column group, this is used for groups that does not contain information * about how many rows they have. * @return The nnz. @@ -589,7 +592,7 @@ public AColGroup addVector(double[] v) { /** * Make a copy of the column group values, and replace all values that match pattern with replacement value. - * + * * @param pattern The value to look for * @param replace The value to replace the other value with * @return A new Column Group, reusing the index structure but with new values. @@ -598,7 +601,7 @@ public AColGroup addVector(double[] v) { /** * Compute the column sum - * + * * @param c The array to add the column sum to. * @param nRows The number of rows in the column group. */ @@ -606,7 +609,7 @@ public AColGroup addVector(double[] v) { /** * Central Moment instruction executed on a column group. - * + * * @param op The Operator to use. * @param nRows The number of rows contained in the ColumnGroup. * @return A Central Moment object. @@ -615,7 +618,7 @@ public AColGroup addVector(double[] v) { /** * Expand the column group to multiple columns. (one hot encode the column group) - * + * * @param max The number of columns to expand to and cutoff values at. * @param ignore If zero and negative values should be ignored. * @param cast If the double values contained should be cast to whole numbers. @@ -626,7 +629,7 @@ public AColGroup addVector(double[] v) { /** * Get the computation cost associated with this column group. - * + * * @param e The computation cost estimator * @param nRows the number of rows in the column group * @return The cost of this column group @@ -635,7 +638,7 @@ public AColGroup addVector(double[] v) { /** * Perform unary operation on the column group and return a new column group - * + * * @param op The operation to perform * @return The new column group */ @@ -643,19 +646,19 @@ public AColGroup addVector(double[] v) { /** * Get if the group is only containing zero - * + * * @return true if empty */ public abstract boolean isEmpty(); /** - * Append the other column group to this column group. This method tries to combine them to return a new column group - * containing both. In some cases it is possible in reasonable time, in others it is not. - * + * Append the other column group to this column group. This method tries to combine them to return a new column + * group containing both. In some cases it is possible in reasonable time, in others it is not. + * * The result is first this column group followed by the other column group in higher row values. - * + * * If it is not possible or very inefficient null is returned. - * + * * @param g The other column group * @return A combined column group or null */ @@ -663,9 +666,9 @@ public AColGroup addVector(double[] v) { /** * Append all column groups in the list provided together in one go allocating the output once. 
- * + * * If it is not possible or very inefficient null is returned. - * + * * @param groups The groups to combine. * @param blen The normal number of rows in the groups * @param rlen The total number of rows of all combined. @@ -677,11 +680,11 @@ public static AColGroup appendN(AColGroup[] groups, int blen, int rlen) { /** * Append all column groups in the list provided together with this. - * + * * A Important detail is the first entry in the group == this, and should not be appended twice. - * + * * If it is not possible or very inefficient null is returned. - * + * * @param groups The groups to combine. * @param blen The normal number of rows in the groups * @param rlen The total number of rows of all combined. @@ -691,7 +694,7 @@ public static AColGroup appendN(AColGroup[] groups, int blen, int rlen) { /** * Get the compression scheme for this column group to enable compression of other data. - * + * * @return The compression scheme of this column group */ public abstract ICLAScheme getCompressionScheme(); @@ -705,14 +708,14 @@ public void clear() { /** * Recompress this column group into a new column group. - * + * * @return A new or the same column group depending on optimization goal. */ public abstract AColGroup recompress(); /** * Recompress this column group into a new column group of the given type. - * + * * @param ct The compressionType that the column group should morph into * @param nRow The number of rows in this columngroup. * @return A new column group @@ -742,7 +745,7 @@ else if(ct == CompressionType.UNCOMPRESSED) { /** * Get the compression info for this column group. - * + * * @param nRow The number of rows in this column group. * @return The compression info for this group. */ @@ -750,7 +753,7 @@ else if(ct == CompressionType.UNCOMPRESSED) { /** * Combine this column group with another - * + * * @param other The other column group to combine with. * @param nRow The number of rows in both column groups. * @return A combined representation as a column group. @@ -761,7 +764,7 @@ public AColGroup combine(AColGroup other, int nRow) { /** * Get encoding of this column group. - * + * * @return The encoding of the index structure. */ public IEncode getEncoding() { @@ -782,19 +785,19 @@ public AColGroup sortColumnIndexes() { /** * Perform row sum on the internal dictionaries, and return the same index structure. - * + * * This method returns null on empty column groups. - * + * * Note this method does not guarantee correct behavior if the given group is AMorphingGroup, instead it should be * morphed to a valid columngroup via extractCommon first. - * + * * @return The reduced colgroup. */ public abstract AColGroup reduceCols(); /** * Selection (left matrix multiply) - * + * * @param selection A sparse matrix with "max" a single one in each row all other values are zero. * @param points The coordinates in the selection matrix to extract. * @param ret The MatrixBlock to decompress the selected rows into @@ -807,17 +810,17 @@ public final void selectionMultiply(MatrixBlock selection, P[] points, MatrixBlo else denseSelection(selection, points, ret, rl, ru); } - + /** * Get an approximate sparsity of this column group - * + * * @return the approximate sparsity of this columngroup */ public abstract double getSparsity(); /** * Sparse selection (left matrix multiply) - * + * * @param selection A sparse matrix with "max" a single one in each row all other values are zero. * @param points The coordinates in the selection matrix to extract. 
* @param ret The Sparse MatrixBlock to decompress the selected rows into @@ -828,7 +831,7 @@ public final void selectionMultiply(MatrixBlock selection, P[] points, MatrixBlo /** * Dense selection (left matrix multiply) - * + * * @param selection A sparse matrix with "max" a single one in each row all other values are zero. * @param points The coordinates in the selection matrix to extract. * @param ret The Dense MatrixBlock to decompress the selected rows into @@ -840,7 +843,7 @@ public final void selectionMultiply(MatrixBlock selection, P[] points, MatrixBlo /** * Method to determine if the columnGroup have the same index structure as another. Note that the column indexes and * dictionaries are allowed to be different. - * + * * @param that the other column group * @return if the index is the same. */ @@ -851,7 +854,7 @@ public boolean sameIndexStructure(AColGroup that) { /** * C bind the list of column groups with this column group. the list of elements provided in the index of each list * is guaranteed to have the same index structures - * + * * @param nRow The number of rows contained in all right and this column group. * @param nCol The number of columns to shift the right hand side column groups over when combining, this should * only effect the column indexes @@ -889,7 +892,7 @@ public AColGroup combineWithSameIndex(int nRow, int nCol, List right) /** * C bind the given column group to this. - * + * * @param nRow The number of rows contained in the right and this column group. * @param nCol The number of columns in this. * @param right The column group to c-bind. @@ -929,16 +932,16 @@ protected IColIndex combineColIndexes(final int nCol, List right) { /** * This method returns a list of column groups that are naive splits of this column group as if it is reshaped. - * + * * This means the column groups rows are split into x number of other column groups where x is the multiplier. - * + * * The indexes are assigned round robbin to each of the output groups, meaning the first index is assigned. - * + * * If for instance the 4. column group is split by a 2 multiplier and there was 5 columns in total originally. The * output becomes 2 column groups at column index 4 and one at 9. - * + * * If possible the split column groups should reuse pointers back to the original dictionaries! - * + * * @param multiplier The number of column groups to split into * @param nRow The number of rows in this column group in case the underlying column group does not know * @param nColOrg The number of overall columns in the host CompressedMatrixBlock. @@ -948,25 +951,25 @@ protected IColIndex combineColIndexes(final int nCol, List right) { /** * This method returns a list of column groups that are naive splits of this column group as if it is reshaped. - * + * * This means the column groups rows are split into x number of other column groups where x is the multiplier. - * + * * The indexes are assigned round robbin to each of the output groups, meaning the first index is assigned. - * + * * If for instance the 4. column group is split by a 2 multiplier and there was 5 columns in total originally. The * output becomes 2 column groups at column index 4 and one at 9. - * + * * If possible the split column groups should reuse pointers back to the original dictionaries! - * + * * This specific variation is pushing down the parallelization given via the executor service provided. 
If not * overwritten the default is to call the normal split reshape - * + * * @param multiplier The number of column groups to split into * @param nRow The number of rows in this column group in case the underlying column group does not know * @param nColOrg The number of overall columns in the host CompressedMatrixBlock * @param pool The executor service to submit parallel tasks to - * @throws Exception In case there is an error we throw the exception out instead of handling it * @return a list of split column groups + * @throws Exception In case there is an error we throw the exception out instead of handling it */ public AColGroup[] splitReshapePushDown(final int multiplier, final int nRow, final int nColOrg, final ExecutorService pool) throws Exception { diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index c5de46b161c..67f2c492e09 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -107,7 +107,7 @@ private ColGroupFactory(MatrixBlock in, CompressedSizeInfo csi, CompressionSetti /** * The actual compression method, that handles the logic of compressing multiple columns together. - * + * * @param in The input matrix, that could have been transposed. If it is transposed the compSettings should specify * this. * @param csi The compression information extracted from the estimation, this contains which groups of columns to @@ -121,7 +121,7 @@ public static List compressColGroups(MatrixBlock in, CompressedSizeIn /** * The actual compression method, that handles the logic of compressing multiple columns together. - * + * * @param in The input matrix, that could have been transposed. If it is transposed the compSettings should specify * this. * @param csi The compression information extracted from the estimation, this contains which groups of columns to @@ -136,7 +136,7 @@ public static List compressColGroups(MatrixBlock in, CompressedSizeIn } /** - * + * * @param in The input matrix, that could have been transposed. If it is transposed the compSettings should specify * this. 
* @param csi The compression information extracted from the estimation, this contains which groups of columns to @@ -233,8 +233,9 @@ private void logEstVsActual(double time, AColGroup act, CompressedSizeInfoColGro time, retType, estC, actC, act.getNumValues(), cols, wanted, warning)); } else { - LOG.debug(String.format("time[ms]: %10.2f %25s est %10.0f -- act %10.0f distinct:%5d cols:%s wanted:%s", - time, retType, estC, actC, act.getNumValues(), cols, wanted)); + LOG.debug( + String.format("time[ms]: %10.2f %25s est %10.0f -- act %10.0f distinct:%5d cols:%s wanted:%s", time, + retType, estC, actC, act.getNumValues(), cols, wanted)); } } @@ -304,11 +305,11 @@ else if(ct == CompressionType.LinearFunctional) { return compressLinearFunctional(colIndexes, in, cs); } } - else if(ct == CompressionType.PiecewiseLinear) { + else if(ct == CompressionType.PiecewiseLinear) { - return compressPiecewiseLinearFunctional(colIndexes, in, cs); + return compressPiecewiseLinearFunctional(colIndexes, in, cs); - } + } else if(ct == CompressionType.DDCFOR) { AColGroup g = directCompressDDC(colIndexes, cg); if(g instanceof ColGroupDDC) @@ -704,7 +705,7 @@ private AColGroup directCompressDeltaDDC(IColIndex colIndexes, CompressedSizeInf if(cs.scaleFactors != null) { throw new NotImplementedException("Delta encoding with quantization not yet implemented"); } - + if(colIndexes.size() > 1) { return directCompressDeltaDDCMultiCol(colIndexes, cg); } @@ -736,7 +737,7 @@ private AColGroup directCompressDeltaDDCSingleCol(IColIndex colIndexes, Compress if(map.size() == 0) return new ColGroupEmpty(colIndexes); - + final double[] dictValues = map.getDictionary(); IDictionary dict = new DeltaDictionary(dictValues, 1); @@ -745,7 +746,8 @@ private AColGroup directCompressDeltaDDCSingleCol(IColIndex colIndexes, Compress return ColGroupDeltaDDC.create(colIndexes, dict, resData, null); } - private AColGroup directCompressDeltaDDCMultiCol(IColIndex colIndexes, CompressedSizeInfoColGroup cg) throws Exception { + private AColGroup directCompressDeltaDDCMultiCol(IColIndex colIndexes, CompressedSizeInfoColGroup cg) + throws Exception { final AMapToData d = MapToFactory.create(nRow, Math.max(Math.min(cg.getNumOffs() + 1, nRow), 126)); final int fill = d.getUpperBoundValue(); d.fill(fill); @@ -824,8 +826,8 @@ private boolean readToMapDDC(IColIndex colIndexes, DblArrayCountHashMap map, AMa int fill) { ReaderColumnSelection reader = (cs.scaleFactors == null) ? 
ReaderColumnSelection.createReader(in, colIndexes,
-			cs.transposed, rl,
-			ru) : ReaderColumnSelection.createQuantizedReader(in, colIndexes, cs.transposed, rl, ru, cs.scaleFactors);
+			cs.transposed, rl, ru) : ReaderColumnSelection.createQuantizedReader(in, colIndexes, cs.transposed, rl, ru,
+			cs.scaleFactors);

		DblArray cellVals = reader.nextRow();
		boolean extra = false;
@@ -1072,185 +1074,179 @@ private static AColGroup compressLinearFunctional(IColIndex colIndexes, MatrixBl
		return ColGroupLinearFunctional.create(colIndexes, coefficients, numRows);
	}

-	public static AColGroup compressPiecewiseLinearFunctional(IColIndex colIndexes, MatrixBlock in, CompressionSettings cs) {
-
-
-		//First, extract the contents of one column
-
-		int numRows = in.getNumRows();
-		int colIdx = colIndexes.get(0); //the first column
-		double[] column = getColumn(in,colIdx);
-
-		//Set the target loss
-
-		// Determine breakpoints: partition into segments
-
-		List<Integer> breakpointsList = computeBreakpoints(cs, column);
-		int[] breakpoints = breakpointsList.stream().mapToInt(Integer::intValue).toArray();
-		//For each segment, linear regression as the compression scheme
-
-		// 3) Per-segment regression -> a,b
-		int numSeg = breakpoints.length - 1;
-		double[] slopes = new double[numSeg];
-		double[] intercepts = new double[numSeg];
-
-		for (int s = 0; s < numSeg; s++) {
-			int start = breakpoints[s];
-			int end = breakpoints[s + 1];
-
-			double[] ab = regressSegment(column, start, end); // uses the same stats as computeSegmentCost
-			slopes[s] = ab[0];
-			intercepts[s] = ab[1];
-		}
-		//Create the data structure: ColGroupPiecewiseLinearCompressed
-
-		return ColGroupPiecewiseLinearCompressed.create(
-			colIndexes,
-			breakpoints,
-			slopes,
-			intercepts,
-			numRows);
-	}
-
-
-	public static double[] getColumn(MatrixBlock in, int colIndex) {
-		int numRows = in.getNumRows(); // number of rows
-		double[] column = new double[numRows]; // buffer for the column
-
-		for (int r = 0; r < numRows; r++) {
-			column[r] = in.get(r, colIndex); // read value (r, colIndex)
-		}
-		return column;
-	}
-	public static List<Integer> computeBreakpoints(CompressionSettings cs, double[] column){
-		int n = column.length;
-		double targetMSE = cs.getPiecewiseTargetLoss();
-		// Case A: no target loss given -> simple variant with a fixed λ
-		if (Double.isNaN(targetMSE) || targetMSE <= 0) {
-			double lambda = 5.0;
-			return computeBreakpointsLambda(column, lambda);
-		}
-
-		// Case B: target loss set -> respect a global error budget
-		double sseMax = n * targetMSE; // MSE -> SSE budget
-
-		double lambdaMin = 0.0; // many segments, minimal error
-		double lambdaMax = 1e6; // few segments, more error
-
-		List<Integer> bestBreaks = null;
-
-		for (int it = 0; it < 20; it++) { // binary search over λ
-			double lambda = 0.5 * (lambdaMin + lambdaMax);
-
-			List<Integer> breaks = computeBreakpointsLambda(column, lambda);
-			double totalSSE = computeTotalSSE(column, breaks);
-
-			if (totalSSE <= sseMax) {
-				// budget met: try a larger λ to get away with even fewer segments
-				bestBreaks = breaks;
-				lambdaMin = lambda;
-			} else {
-				// error too large: decrease λ, allow more segments
-				lambdaMax = lambda;
-			}
-		}
-
-		if (bestBreaks == null)
-			bestBreaks = computeBreakpointsLambda(column, lambdaMin);
-
-		return bestBreaks;
-	}
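
A note on cost: the dynamic program in computeBreakpointsLambda evaluates dp[index] = min over i of dp[i] + SSE(i, index) + lambda, and every computeSegmentCost call rescans its segment, so segmentation is cubic in the column length. A hedged sketch of the standard prefix-sum refinement that would make each segment cost O(1); this is an illustration, not part of the patch:

	// Sketch: O(1) segment SSE via prefix sums over x = i, y = column[i].
	// For [s, e) with centered moments Sxx, Sxy, Syy, the least-squares
	// residual of the best-fit line is SSE = Syy - Sxy^2 / Sxx.
	static double[][] ssePrefix(double[] column) {
		int n = column.length;
		double[][] p = new double[5][n + 1]; // running sums of x, y, x*x, x*y, y*y
		for(int i = 0; i < n; i++) {
			double x = i, y = column[i];
			p[0][i + 1] = p[0][i] + x;
			p[1][i + 1] = p[1][i] + y;
			p[2][i + 1] = p[2][i] + x * x;
			p[3][i + 1] = p[3][i] + x * y;
			p[4][i + 1] = p[4][i] + y * y;
		}
		return p;
	}

	static double segmentSSE(double[][] p, int s, int e) {
		int m = e - s;
		if(m <= 1)
			return 0.0; // fewer than two points fit exactly
		double sx = p[0][e] - p[0][s], sy = p[1][e] - p[1][s];
		double sxx = p[2][e] - p[2][s] - sx * sx / m;
		double sxy = p[3][e] - p[3][s] - sx * sy / m;
		double syy = p[4][e] - p[4][s] - sy * sy / m;
		return sxx == 0 ? 0.0 : Math.max(0.0, syy - sxy * sxy / sxx);
	}
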
0.0;
-
-		for (int index = 1; index <= sizeColumn; index++) {
-			dp[index] = Double.POSITIVE_INFINITY;
-			for (int i = 0; i < index; i++) { // Segment [i, index)
-				double costCurrentSegment = computeSegmentCost(column, i, index); // SSE
-				double candidateCost = dp[i] + costCurrentSegment + lambda;
-				if (candidateCost < dp[index]) {
-					dp[index] = candidateCost;
-					prev[index] = i;
-				}
-			}
-		}
-
-		List<Integer> segmentLimits = new ArrayList<>();
-		int breakpointIndex = sizeColumn;
-		while (breakpointIndex > 0) {
-			segmentLimits.add(breakpointIndex);
-			breakpointIndex = prev[breakpointIndex];
-		}
-		segmentLimits.add(0);
-		Collections.sort(segmentLimits);
-		return segmentLimits;
-	}
-
-	public static double computeSegmentCost(double[] column, int start, int end) {
-		int n = end - start;
-		if (n <= 1)
-			return 0.0;
-
-		double[] ab = regressSegment(column, start, end);
-		double slope = ab[0];
-		double intercept = ab[1];
-
-		double sse = 0.0;
-		for (int i = start; i < end; i++) {
-			double x = i;
-			double y = column[i];
-			double yhat = slope * x + intercept;
-			double diff = y - yhat;
-			sse += diff * diff;
-		}
-		return sse; // oder sse / n als MSE
-	}
-	public static double computeTotalSSE(double[] column, List<Integer> breaks) {
-		double total = 0.0;
-		for (int s = 0; s < breaks.size() - 1; s++) {
-			int start = breaks.get(s);
-			int end = breaks.get(s + 1);
-			total += computeSegmentCost(column, start, end); // SSE des Segments
-		}
-		return total;
-	}
-
-
-	public static double[] regressSegment(double[] column, int start, int end) {
-		int n = end - start;
-		if (n <= 0)
-			return new double[] {0.0, 0.0};
-
-		double sumX = 0, sumY = 0, sumXX = 0, sumXY = 0;
-		for (int i = start; i < end; i++) {
-			double x = i;
-			double y = column[i];
-			sumX += x;
-			sumY += y;
-			sumXX += x * x;
-			sumXY += x * y;
-		}
-
-		double nD = n;
-		double denom = nD * sumXX - sumX * sumX;
-		double slope, intercept;
-		if (denom == 0) {
-			slope = 0.0;
-			intercept = sumY / nD;
-		}
-		else {
-			slope = (nD * sumXY - sumX * sumY) / denom;
-			intercept = (sumY - slope * sumX) / nD;
-		}
-		return new double[] {slope, intercept};
-	}
+	public static AColGroup compressPiecewiseLinearFunctional(IColIndex colIndexes, MatrixBlock in,
+		CompressionSettings cs) {
+
+		// First, materialize the contents of one column
+
+		int numRows = in.getNumRows();
+		int colIdx = colIndexes.get(0); // the first column
+		double[] column = getColumn(in, colIdx);
+
+		// the target loss is read from the CompressionSettings inside computeBreakpoints
+
+		// determine breakpoints: the partitioning into segments
+
+		List<Integer> breakpointsList = computeBreakpoints(cs, column);
+		int[] breakpoints = breakpointsList.stream().mapToInt(Integer::intValue).toArray();
+		// for each segment, linear regression serves as the compression scheme
+
+		// 3) per-segment regression -> a,b
+		int numSeg = breakpoints.length - 1;
+		double[] slopes = new double[numSeg];
+		double[] intercepts = new double[numSeg];
+
+		for(int s = 0; s < numSeg; s++) {
+			int start = breakpoints[s];
+			int end = breakpoints[s + 1];
+
+			double[] ab = regressSegment(column, start, end); // uses the same statistics as computeSegmentCost
+			slopes[s] = ab[0];
+			intercepts[s] = ab[1];
+		}
+		// build the data structure: ColGroupPiecewiseLinearCompressed
+
+		return ColGroupPiecewiseLinearCompressed.create(colIndexes, breakpoints, slopes, intercepts, numRows);
+	}
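The factory method above stores one linear model per segment; decompression inverts it by evaluating yhat(r) = slopes[s] * r + intercepts[s] for each row r in [breakpoints[s], breakpoints[s+1]). A minimal sketch of that inverse on a plain double[] (the helper name reconstruct is illustrative only, not part of this patch):

	// Sketch: rebuild a column from (breakpoints, slopes, intercepts).
	// Mirrors the per-segment loop of ColGroupPiecewiseLinearCompressed.decompressToDenseBlock.
	static double[] reconstruct(int[] breakpoints, double[] slopes, double[] intercepts, int numRows) {
		double[] out = new double[numRows];
		for(int s = 0; s < breakpoints.length - 1; s++)
			for(int r = breakpoints[s]; r < Math.min(breakpoints[s + 1], numRows); r++)
				out[r] = slopes[s] * r + intercepts[s]; // yhat = a_s * r + b_s
		return out;
	}

Storage drops from numRows values to 2 * numSeg doubles plus numSeg + 1 breakpoint indices, so the scheme pays off whenever a small number of segments meets the target loss.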
+
+	public static double[] getColumn(MatrixBlock in, int colIndex) {
+		int numRows = in.getNumRows(); // number of rows
+		double[] column = new double[numRows]; // buffer for the column values
+
+		for(int r = 0; r < numRows; r++) {
+			column[r] = in.get(r, colIndex); // read the value at (r, colIndex)
+		}
+		return column;
+	}
+
+	public static List<Integer> computeBreakpoints(CompressionSettings cs, double[] column) {
+		int n = column.length;
+		double targetMSE = cs.getPiecewiseTargetLoss();
+		// Case A: no target loss given -> simple variant with a fixed λ
+		if(Double.isNaN(targetMSE) || targetMSE <= 0) {
+			double lambda = 5.0;
+			return computeBreakpointsLambda(column, lambda);
+		}
+
+		// Case B: target loss set -> respect the global error budget
+		double sseMax = n * targetMSE; // MSE -> SSE budget
+
+		double lambdaMin = 0.0; // many segments, minimal error
+		double lambdaMax = 1e6; // few segments, more error
+
+		List<Integer> bestBreaks = null;
+		for(int it = 0; it < 20; it++) { // binary search on λ
+			double lambda = 0.5 * (lambdaMin + lambdaMax);
+			List<Integer> breaks = computeBreakpointsLambda(column, lambda);
+			double totalSSE = computeTotalSSE(column, breaks);
+			if(totalSSE <= sseMax) {
+				// budget met: try a larger λ to get by with even fewer segments
+				bestBreaks = breaks;
+				lambdaMin = lambda;
+			}
+			else {
+				// error too large: decrease λ, allow more segments
+				lambdaMax = lambda;
+			}
+		}
+
+		if(bestBreaks == null)
+			bestBreaks = computeBreakpointsLambda(column, lambdaMin);
+
+		return bestBreaks;
+	}
+
+	public static List<Integer> computeBreakpointsLambda(double[] column, double lambda) {
+		int sizeColumn = column.length;
+		double[] dp = new double[sizeColumn + 1];
+		int[] prev = new int[sizeColumn + 1];
+
+		dp[0] = 0.0;
+
+		for(int index = 1; index <= sizeColumn; index++) {
+			dp[index] = Double.POSITIVE_INFINITY;
+			for(int i = 0; i < index; i++) { // segment [i, index)
+				double costCurrentSegment = computeSegmentCost(column, i, index); // SSE
+				double candidateCost = dp[i] + costCurrentSegment + lambda;
+				if(candidateCost < dp[index]) {
+					dp[index] = candidateCost;
+					prev[index] = i;
+				}
+			}
+		}
+
+		List<Integer> segmentLimits = new ArrayList<>();
+		int breakpointIndex = sizeColumn;
+		while(breakpointIndex > 0) {
+			segmentLimits.add(breakpointIndex);
+			breakpointIndex = prev[breakpointIndex];
+		}
+		segmentLimits.add(0);
+		Collections.sort(segmentLimits);
+		return segmentLimits;
+	}
+
+	public static double computeSegmentCost(double[] column, int start, int end) {
+		int n = end - start;
+		if(n <= 1)
+			return 0.0;
+
+		double[] ab = regressSegment(column, start, end);
+		double slope = ab[0];
+		double intercept = ab[1];
+
+		double sse = 0.0;
+		for(int i = start; i < end; i++) {
+			double x = i;
+			double y = column[i];
+			double yhat = slope * x + intercept;
+			double diff = y - yhat;
+			sse += diff * diff;
+		}
+		return sse; // or sse / n as MSE
+	}
+
+	public static double computeTotalSSE(double[] column, List<Integer> breaks) {
+		double total = 0.0;
+		for(int s = 0; s < breaks.size() - 1; s++) {
+			int start = breaks.get(s);
+			int end = breaks.get(s + 1);
+			total += computeSegmentCost(column, start, end); // SSE of the segment
+		}
+		return total;
+	}
+
+	public static double[] regressSegment(double[] column, int start, int end) {
+		int n = end - start;
+		if(n <= 0)
+			return new double[] {0.0, 0.0};
+
+		double sumX = 0, sumY = 0, sumXX = 0, sumXY = 0;
+		for(int i = start; i < end; i++) {
+			double x = i;
+			double y = column[i];
+			sumX += x;
+			sumY += y;
+			sumXX += x * x;
+			sumXY += x * y;
+		}
+
+		double nD = n;
+		double denom = nD * sumXX - sumX * sumX;
+		double slope, intercept;
+		if(denom == 0) {
+			slope = 0.0;
+			intercept = sumY / nD;
+		}
+		else {
+			slope = (nD * sumXY - sumX * sumY) / denom;
+			intercept = (sumY - slope * sumX) / nD;
+		}
+		return new double[] {slope, intercept};
+	}
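For reference, regressSegment implements the textbook closed-form least-squares fit over the points (x_i, y_i) with x_i = start..end-1 and y_i = column[x_i]:

	a = (n \sum x_i y_i - \sum x_i \sum y_i) / (n \sum x_i^2 - (\sum x_i)^2)
	b = (\sum y_i - a \sum x_i) / n

The denom == 0 guard covers n = 1, where the line is underdetermined and the fit degenerates to a horizontal line through the single point (a = 0, b = y). Note also that computeBreakpointsLambda evaluates computeSegmentCost for all O(n^2) candidate segments and each evaluation scans the segment, so the dynamic program is cubic as written; caching prefix sums of x, y, x^2 and x*y would make each segment cost O(1) without changing the result.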

	private AColGroup compressSDCFromSparseTransposedBlock(IColIndex cols, int nrUniqueEstimate, double tupleSparsity) {
		if(cols.size() > 1)

diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java
index 4062c4da611..1f39dc44cb0 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java
@@ -21,381 +21,372 @@
public class ColGroupPiecewiseLinearCompressed extends AColGroupCompressed {
-	IColIndex colIndexes;
-	int[] breakpoints;
-	double[] slopes;
-	double[] intercepts;
-	int numRows;
+	IColIndex colIndexes;
+	int[] breakpoints;
+	double[] slopes;
+	double[] intercepts;
+	int numRows;
-	protected ColGroupPiecewiseLinearCompressed(IColIndex colIndices) {
-		super(colIndices);
-	}
+	protected ColGroupPiecewiseLinearCompressed(IColIndex colIndices) {
+		super(colIndices);
+	}
+	public ColGroupPiecewiseLinearCompressed(IColIndex colIndexes, int[] breakpoints, double[] slopes,
+		double[] intercepts, int numRows) {
+		super(colIndexes);
+		this.breakpoints = breakpoints;
+		this.slopes = slopes;
+		this.intercepts = intercepts;
+		this.numRows = numRows;
+	}
-	public ColGroupPiecewiseLinearCompressed(IColIndex colIndexes, int[] breakpoints, double[] slopes, double[] intercepts, int numRows) {
-		super(colIndexes);
-		this.breakpoints = breakpoints;
-		this.slopes = slopes;
-		this.intercepts = intercepts;
-		this.numRows = numRows;
-	}
+	public static AColGroup create(IColIndex colIndexes, int[] breakpoints, double[] slopes, double[] intercepts,
+		int numRows) {
+		if(breakpoints == null || breakpoints.length < 2)
+			throw new IllegalArgumentException("Need at least one segment");
+		int numSeg = breakpoints.length - 1;
+		if(slopes.length != numSeg || intercepts.length != numSeg)
+			throw new IllegalArgumentException("Inconsistent segment arrays");
-	public static AColGroup create(IColIndex colIndexes, int[] breakpoints, double[] slopes, double[] intercepts, int numRows) {
-		if (breakpoints == null || breakpoints.length < 2)
-			throw new IllegalArgumentException("Need at least one segment");
+		int[] bpCopy = Arrays.copyOf(breakpoints, breakpoints.length);
+		double[] slopeCopy = Arrays.copyOf(slopes, slopes.length);
+		double[] interceptCopy = Arrays.copyOf(intercepts, intercepts.length);
-		int numSeg = breakpoints.length - 1;
-		if (slopes.length != numSeg || intercepts.length != numSeg)
-			throw new IllegalArgumentException("Inconsistent segment arrays");
+		return new ColGroupPiecewiseLinearCompressed(colIndexes, bpCopy, slopeCopy, interceptCopy, numRows);
-		int[] bpCopy = Arrays.copyOf(breakpoints, breakpoints.length);
-		double[] slopeCopy = Arrays.copyOf(slopes, slopes.length);
-		double[] interceptCopy = Arrays.copyOf(intercepts, intercepts.length);
+	}
+	@Override
+	public void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC) {
-		return new ColGroupPiecewiseLinearCompressed(
-				colIndexes,
-				bpCopy,
-				slopeCopy,
-				interceptCopy,
-				numRows);
+		if(db == null || _colIndexes == null || _colIndexes.size() == 0 || breakpoints == null || slopes == null ||
+			intercepts == null) {
+			return;
+		}
-	}
+		int numSeg = breakpoints.length - 1;
+		if(numSeg <= 0 || rl >= ru) {
+			return;
+		}
-	@Override
-	public void 
decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC) { + final int col = _colIndexes.get(0); - if (db == null || _colIndexes == null || _colIndexes.size() == 0 || - breakpoints == null || slopes == null || intercepts == null) { - return; - } + for(int s = 0; s < numSeg; s++) { + int segStart = breakpoints[s]; + int segEnd = breakpoints[s + 1]; + if(segStart >= segEnd) + continue; // Invalid Segment - int numSeg = breakpoints.length - 1; - if (numSeg <= 0 || rl >= ru) { - return; - } + double a = slopes[s]; + double b = intercepts[s]; - final int col = _colIndexes.get(0); + int rs = Math.max(segStart, rl); + int re = Math.min(segEnd, ru); + if(rs >= re) + continue; - for (int s = 0; s < numSeg; s++) { - int segStart = breakpoints[s]; - int segEnd = breakpoints[s + 1]; - if (segStart >= segEnd) continue; // Invalid Segment + for(int r = rs; r < re; r++) { + double yhat = a * r + b; + int gr = offR + r; + int gc = offC + col; - double a = slopes[s]; - double b = intercepts[s]; + if(gr >= 0 && gr < db.numRows() && gc >= 0 && gc < db.numCols()) { + db.set(gr, gc, yhat); + } + } + } + } - int rs = Math.max(segStart, rl); - int re = Math.min(segEnd, ru); - if (rs >= re) continue; + @Override + protected double computeMxx(double c, Builtin builtin) { + return 0; + } - for (int r = rs; r < re; r++) { - double yhat = a * r + b; - int gr = offR + r; - int gc = offC + col; + @Override + protected void computeColMxx(double[] c, Builtin builtin) { + } - if (gr >= 0 && gr < db.numRows() && gc >= 0 && gc < db.numCols()) { - db.set(gr, gc, yhat); - } - } - } - } + @Override + protected void computeSum(double[] c, int nRows) { - @Override - protected double computeMxx(double c, Builtin builtin) { - return 0; - } + } - @Override - protected void computeColMxx(double[] c, Builtin builtin) { + @Override + protected void computeSumSq(double[] c, int nRows) { - } + } - @Override - protected void computeSum(double[] c, int nRows) { + @Override + protected void computeColSumsSq(double[] c, int nRows) { - } + } - @Override - protected void computeSumSq(double[] c, int nRows) { + @Override + protected void computeRowSums(double[] c, int rl, int ru, double[] preAgg) { - } + } - @Override - protected void computeColSumsSq(double[] c, int nRows) { + @Override + protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru, double[] preAgg) { - } + } - @Override - protected void computeRowSums(double[] c, int rl, int ru, double[] preAgg) { + @Override + protected void computeProduct(double[] c, int nRows) { - } + } - @Override - protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru, double[] preAgg) { + @Override + protected void computeRowProduct(double[] c, int rl, int ru, double[] preAgg) { - } + } - @Override - protected void computeProduct(double[] c, int nRows) { + @Override + protected void computeColProduct(double[] c, int nRows) { - } + } - @Override - protected void computeRowProduct(double[] c, int rl, int ru, double[] preAgg) { + @Override + protected double[] preAggSumRows() { + return new double[0]; + } - } + @Override + protected double[] preAggSumSqRows() { + return new double[0]; + } - @Override - protected void computeColProduct(double[] c, int nRows) { + @Override + protected double[] preAggProductRows() { + return new double[0]; + } - } + @Override + protected double[] preAggBuiltinRows(Builtin builtin) { + return new double[0]; + } - @Override - protected double[] preAggSumRows() { - return new double[0]; - } + @Override + public boolean 
sameIndexStructure(AColGroupCompressed that) {
+		return false;
+	}
-	@Override
-	protected double[] preAggSumSqRows() {
-		return new double[0];
-	}
+	@Override
+	protected void tsmm(double[] result, int numColumns, int nRows) {
-	@Override
-	protected double[] preAggProductRows() {
-		return new double[0];
-	}
+	}
-	@Override
-	protected double[] preAggBuiltinRows(Builtin builtin) {
-		return new double[0];
-	}
+	@Override
+	public AColGroup copyAndSet(IColIndex colIndexes) {
+		return null;
+	}
-	@Override
-	public boolean sameIndexStructure(AColGroupCompressed that) {
-		return false;
-	}
+	@Override
+	public void decompressToDenseBlockTransposed(DenseBlock db, int rl, int ru) {
-	@Override
-	protected void tsmm(double[] result, int numColumns, int nRows) {
+	}
-	}
+	@Override
+	public void decompressToSparseBlockTransposed(SparseBlockMCSR sb, int nColOut) {
-	@Override
-	public AColGroup copyAndSet(IColIndex colIndexes) {
-		return null;
-	}
+	}
-	@Override
-	public void decompressToDenseBlockTransposed(DenseBlock db, int rl, int ru) {
+	@Override
+	public double getIdx(int r, int colIdx) {
+		// important: bounds check for r and colIdx
+		if(r < 0 || r >= numRows || colIdx < 0 || colIdx >= _colIndexes.size()) {
+			return 0.0;
+		}
-	}
+		// segment search (safe after the bounds check)
+		int seg = 0;
+		for(int i = 1; i < breakpoints.length; i++) {
+			if(r < breakpoints[i]) {
+				break;
+			}
+			seg = Math.min(i, breakpoints.length - 2); // r >= breakpoints[i], so r lies in segment i (clamped to the last segment)
-	@Override
-	public void decompressToSparseBlockTransposed(SparseBlockMCSR sb, int nColOut) {
+		}
-	}
+		return slopes[seg] * (double) r + intercepts[seg];
+	}
-	@Override
-	public double getIdx(int r, int colIdx) {
-		// ✅ CRUCIAL: Bounds-Check für colIdx!
-		if (r < 0 || r >= numRows || colIdx < 0 || colIdx >= _colIndexes.size()) {
-			return 0.0;
-		}
+	@Override
+	public int getNumValues() {
+		return breakpoints.length + slopes.length + intercepts.length;
+	}
-		// Segment-Suche (sicher jetzt)
-		int seg = 0;
-		for (int i = 1; i < breakpoints.length; i++) {
-			if (r < breakpoints[i]) {
-				break;
-			}
-			seg = i - 1; // seg < numSeg immer! 
- } + @Override + protected ColGroupType getColGroupType() { + return null; + } - return slopes[seg] * (double) r + intercepts[seg]; - } + @Override + public void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC) { - @Override - public int getNumValues() { - return breakpoints.length + slopes.length + intercepts.length; - } + } - @Override - public CompressionType getCompType() { - return null; - } + @Override + public AColGroup rightMultByMatrix(MatrixBlock right, IColIndex allCols, int k) { + return null; + } - @Override - protected ColGroupType getColGroupType() { - return null; - } + @Override + public void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock result, int rl, int ru, int cl, int cu) { + + } + + @Override + public void leftMultByAColGroup(AColGroup lhs, MatrixBlock result, int nRows) { + + } + + @Override + public void tsmmAColGroup(AColGroup other, MatrixBlock result) { + + } + + @Override + public AColGroup scalarOperation(ScalarOperator op) { + return null; + } + + @Override + public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) { + return null; + } + + @Override + public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) { + return null; + } + + @Override + protected AColGroup sliceSingleColumn(int idx) { + return null; + } + @Override + protected AColGroup sliceMultiColumns(int idStart, int idEnd, IColIndex outputCols) { + return null; + } + + @Override + public AColGroup sliceRows(int rl, int ru) { + return null; + } + + @Override + public boolean containsValue(double pattern) { + return false; + } + + @Override + public long getNumberNonZeros(int nRows) { + return 0; + } + + @Override + public AColGroup replace(double pattern, double replace) { + return null; + } + + @Override + public void computeColSums(double[] c, int nRows) { + + } + + @Override + public CmCovObject centralMoment(CMOperator op, int nRows) { + return null; + } + + @Override + public AColGroup rexpandCols(int max, boolean ignore, boolean cast, int nRows) { + return null; + } + + @Override + public double getCost(ComputationCostEstimator e, int nRows) { + return 0; + } + + @Override + public AColGroup unaryOperation(UnaryOperator op) { + return null; + } + @Override + public AColGroup append(AColGroup g) { + return null; + } - @Override - public void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC) { - - } - - @Override - public AColGroup rightMultByMatrix(MatrixBlock right, IColIndex allCols, int k) { - return null; - } - - @Override - public void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock result, int rl, int ru, int cl, int cu) { - - } - - @Override - public void leftMultByAColGroup(AColGroup lhs, MatrixBlock result, int nRows) { - - } - - @Override - public void tsmmAColGroup(AColGroup other, MatrixBlock result) { - - } - - @Override - public AColGroup scalarOperation(ScalarOperator op) { - return null; - } - - @Override - public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) { - return null; - } - - @Override - public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) { - return null; - } - - @Override - protected AColGroup sliceSingleColumn(int idx) { - return null; - } - - @Override - protected AColGroup sliceMultiColumns(int idStart, int idEnd, IColIndex outputCols) { - return null; - } - - @Override - public AColGroup sliceRows(int rl, int ru) { - return null; - } - - @Override - public boolean 
containsValue(double pattern) { - return false; - } - - @Override - public long getNumberNonZeros(int nRows) { - return 0; - } - - @Override - public AColGroup replace(double pattern, double replace) { - return null; - } - - @Override - public void computeColSums(double[] c, int nRows) { - - } - - @Override - public CmCovObject centralMoment(CMOperator op, int nRows) { - return null; - } - - - @Override - public AColGroup rexpandCols(int max, boolean ignore, boolean cast, int nRows) { - return null; - } - - @Override - public double getCost(ComputationCostEstimator e, int nRows) { - return 0; - } - - @Override - public AColGroup unaryOperation(UnaryOperator op) { - return null; - } - - @Override - public AColGroup append(AColGroup g) { - return null; - } - - @Override - protected AColGroup appendNInternal(AColGroup[] groups, int blen, int rlen) { - return null; - } - - @Override - public ICLAScheme getCompressionScheme() { - return null; - } - - @Override - public AColGroup recompress() { - return null; - } - - @Override - public CompressedSizeInfoColGroup getCompressionInfo(int nRow) { - return null; - } - - @Override - protected AColGroup fixColIndexes(IColIndex newColIndex, int[] reordering) { - return null; - } - - @Override - public AColGroup reduceCols() { - return null; - } - - @Override - public double getSparsity() { - return 0; - } - - @Override - protected void sparseSelection(MatrixBlock selection, ColGroupUtils.P[] points, MatrixBlock ret, int rl, int ru) { - - } - - @Override - protected void denseSelection(MatrixBlock selection, ColGroupUtils.P[] points, MatrixBlock ret, int rl, int ru) { - - } - - @Override - public AColGroup[] splitReshape(int multiplier, int nRow, int nColOrg) { - return new AColGroup[0]; - } - - public int[] getBreakpoints() { - return breakpoints; - } - - public double[] getSlopes() { - return slopes; - } - - - public double[] getIntercepts() { - return intercepts; - } + @Override + protected AColGroup appendNInternal(AColGroup[] groups, int blen, int rlen) { + return null; + } + + @Override + public ICLAScheme getCompressionScheme() { + return null; + } + + @Override + public AColGroup recompress() { + return null; + } + + @Override + public CompressedSizeInfoColGroup getCompressionInfo(int nRow) { + return null; + } + + @Override + protected AColGroup fixColIndexes(IColIndex newColIndex, int[] reordering) { + return null; + } + + @Override + public AColGroup reduceCols() { + return null; + } + + @Override + public double getSparsity() { + return 0; + } + + @Override + protected void sparseSelection(MatrixBlock selection, ColGroupUtils.P[] points, MatrixBlock ret, int rl, int ru) { + + } + + @Override + protected void denseSelection(MatrixBlock selection, ColGroupUtils.P[] points, MatrixBlock ret, int rl, int ru) { + + } + + @Override + public AColGroup[] splitReshape(int multiplier, int nRow, int nColOrg) { + return new AColGroup[0]; + } + + public int[] getBreakpoints() { + return breakpoints; + } + + public double[] getSlopes() { + return slopes; + } + + public double[] getIntercepts() { + return intercepts; + } } diff --git a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java index 5b3688be5b1..4f309fda967 100644 --- a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java +++ 
b/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java
@@ -11,696 +11,689 @@
import org.apache.sysds.runtime.data.DenseBlock;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.junit.Test;
+
import java.util.Arrays;
import java.util.List;
+
import static org.apache.sysds.runtime.compress.colgroup.ColGroupFactory.*;
import static org.junit.Assert.*;
-
public class ColGroupPiecewiseLinearCompressedTest {
+	@Test
+	public void testComputeBreakpoints_uniformColumn() {
+		CompressionSettings cs = new CompressionSettingsBuilder().create();
+		cs.setPiecewiseTargetLoss(1e-3);
+		double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; // ← test-specific input
+		List<Integer> breaks = computeBreakpoints(cs, column);
+		assertEquals(Arrays.asList(0, 5), breaks); // expected: no interior breaks
+	}
+
+	@Test
+	public void testComputeBreakpoints_linearIncreasing() {
+		CompressionSettings cs = new CompressionSettingsBuilder().create();
+		cs.setPiecewiseTargetLoss(1e-3);
+		double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; // ← different column
+		List<Integer> breaks = computeBreakpoints(cs, column);
+		assertEquals(Arrays.asList(0, 5), breaks); // expected
+	}
+
+	@Test
+	public void testComputeBreakpoints_highLoss_uniform() {
+		CompressionSettings cs = new CompressionSettingsBuilder().create();
+		cs.setPiecewiseTargetLoss(10000.0);
+		double[] column = {1.0, 1.0, 1.0, 1.0, 1.0};
+		List<Integer> breaks = computeBreakpoints(cs, column);
+		assertEquals(Arrays.asList(0, 5), breaks);
+	}
+
+	@Test
+	public void testComputeBreakpoints_twoSegments() {
+		CompressionSettings cs = new CompressionSettingsBuilder().create();
+		cs.setPiecewiseTargetLoss(1e-3);
+		// {1,1,1, 2,2,2} → 2 segments → [0,3,6]
+		double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0};
+		var breaks = computeBreakpoints(cs, column);
+		assertEquals(Arrays.asList(0, 3, 6), breaks);
+	}
+
+	@Test
+	public void testComputeBreakpoints_noLoss_linear() {
+		CompressionSettings cs = new CompressionSettingsBuilder().create();
+		cs.setPiecewiseTargetLoss(0.0);
+		double[] column = {0.0, 1.0, 2.0, 3.0, 4.0};
+		List<Integer> breaks = computeBreakpoints(cs, column);
+		assertEquals(Arrays.asList(0, 5), breaks); // target loss 0 falls back to the fixed-λ variant
+	}
+
+	@Test
+	public void testComputeBreakpointsLambda_const() {
+		double[] column = {1.0, 1.0, 1.0, 1.0, 1.0};
+		List<Integer> breaks = computeBreakpointsLambda(column, 5.0);
+		assertEquals(Arrays.asList(0, 5), breaks);
+
+		breaks = computeBreakpointsLambda(column, 0.01);
+		assertEquals(Arrays.asList(0, 5), breaks);
+	}
+
+	@Test
+	public void testComputeBreakpointsLambda_twoSegments() {
+		double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0}; // 6 values
+
+		// small lambda -> many segments (splits are almost free)
+		List<Integer> breaks = computeBreakpointsLambda(column, 0.01);
+		assertTrue(breaks.contains(3));
+		assertEquals(3, breaks.size());
+		assertEquals(Arrays.asList(0, 3, 6), breaks);
+
+		// a large lambda leaves only one segment
+		breaks = computeBreakpointsLambda(column, 1000.0);
+		assertEquals(Arrays.asList(0, 6), breaks);
+	}
+
+	@Test
+	public void testComputeBreakpointsLambda_jumpWithTrend() {
+		double[] column = {0.0, 1.0, 2.0, 10.0, 11.0, 12.0};
+
+		// coarse fit: one segment per "section"
+		List<Integer> breaks = computeBreakpointsLambda(column, 0.5);
+		assertEquals(Arrays.asList(0, 3, 6), breaks);
+
+		// only one segment when lambda is very large
+		breaks = computeBreakpointsLambda(column, 100.0);
+		assertEquals(Arrays.asList(0, 6), breaks);
+	}
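The λ thresholds asserted above can be checked by hand for the two-segment case. For the column {1,1,1,2,2,2}, a single segment [0,6) has the least-squares fit y ≈ 0.257x + 0.857 with SSE = 12/35 ≈ 0.343, whereas the split into [0,3) and [3,6) fits both constant halves exactly (SSE = 0). The DP therefore compares 0.343 + λ for one segment against 0 + 2λ for two, and splits exactly when λ < 0.343; this is why λ = 0.01 yields [0, 3, 6] and λ = 1000 yields [0, 6].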
+
+	@Test
+	public void testComputeBreakpointsLambda_linear() {
+		double[] column = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0};
+
+		List<Integer> breaks = computeBreakpointsLambda(column, 1.0);
+		assertEquals(Arrays.asList(0, 6), breaks);
+
+		// with a very small lambda we only check that the boundaries are sane
+		breaks = computeBreakpointsLambda(column, 0.001);
+		assertTrue(breaks.size() >= 2);
+		assertTrue(breaks.get(0) == 0);
+		assertTrue(breaks.get(breaks.size() - 1) == column.length);
+	}
+
+	@Test
+	public void testComputeBreakpointsLambda_edge_lambdaVerySmall() {
+		double[] column = {1.0, 1.1, 1.0, 1.1, 1.0};
+
+		List<Integer> breaks = computeBreakpointsLambda(column, 0.001);
+		assertNotNull(breaks);
+		assertFalse(breaks.isEmpty());
+		assertEquals(0, (int) breaks.get(0));
+		assertEquals(column.length, (int) breaks.get(breaks.size() - 1));
+
+		// check that the list is sorted
+		for(int i = 1; i < breaks.size(); i++) {
+			assertTrue(breaks.get(i) >= breaks.get(i - 1));
+		}
+	}
+
+	@Test
+	public void testComputeBreakpointsLambda_edge_lambdaVeryLarge() {
+		double[] column = {1.0, 2.0, 1.5, 2.5, 1.8};
+
+		List<Integer> breaks = computeBreakpointsLambda(column, 1000.0);
+		assertEquals(Arrays.asList(0, 5), breaks);
+	}
+
+	@Test
+	public void testComputeSegmentCost_emptyOrSingle() {
+		double[] column = {10.0, 20.0, 30.0};
+
+		// 0 elements (empty)
+		assertEquals(0.0, computeSegmentCost(column, 0, 0), 1e-10);
+		assertEquals(0.0, computeSegmentCost(column, 1, 1), 1e-10);
+
+		// 1 element → the regression line is not uniquely defined, but SSE=0
+		assertEquals(0.0, computeSegmentCost(column, 0, 1), 1e-10);
+		assertEquals(0.0, computeSegmentCost(column, 1, 2), 1e-10);
+		assertEquals(0.0, computeSegmentCost(column, 2, 3), 1e-10);
+	}
+
+	@Test
+	public void testComputeSegmentCost_twoConstantPoints() {
+		double[] column = {5.0, 5.0, 1.0, 1.0};
+
+		// two identical points (constant) → SSE = 0
+		double sse = computeSegmentCost(column, 0, 2);
+		assertEquals(0.0, sse, 1e-10);
+	}
+
+	@Test
+	public void testComputeSegmentCost_twoDifferentPoints() {
+		double[] column = {0.0, 2.0, 1.0, 3.0};
+
+		// two points: (0,0) and (1,2) → line y = 2*x, error = 0
+		double sse = computeSegmentCost(column, 0, 2);
+		assertEquals(0.0, sse, 1e-10);
+
+		// two points: (2,1) and (3,3) → line y = 2*x - 3, error = 0
+		sse = computeSegmentCost(column, 2, 4);
+		assertEquals(0.0, sse, 1e-10);
+	}
+
+	@Test
+	public void testComputeSegmentCost_constantThree() {
+		double[] column = {0.0, 0.0, 0.0};
+		double sse = computeSegmentCost(column, 0, 3);
+		assertEquals(0.0, sse, 1e-10);
+	}
+
+	@Test
+	public void testComputeSegmentCost_consistent_with_regression() {
+		double[] column = {0.0, 2.0, 0.0, 4.0, 0.0, 6.0};
+
+		int start = 0, end = 3;
+		double[] ab = regressSegment(column, start, end);
+		double slope = ab[0], intercept = ab[1];
+		double sse_hand = 0.0;
+		for(int i = start; i < end; i++) {
+			double yhat = slope * i + intercept;
+			double diff = column[i] - yhat;
+			sse_hand += diff * diff;
+		}
+
+		double sse = computeSegmentCost(column, start, end);
+		assertEquals(sse_hand, sse, 1e-10);
+	}
+
+	@Test
+	public void testComputeTotalSSE_emptyBreaks() {
+		double[] column = {1.0, 2.0, 3.0};
+		List<Integer> breaks = Arrays.asList(); // empty → no segments
+		double total = computeTotalSSE(column, breaks);
+
+		// 0 segments → the sum over 0 segments is 0
+		assertEquals(0.0, total, 1e-10);
+	}
+
+	@Test
+	public void testComputeTotalSSE_singleSegment_all() {
+		double[] column = {1.0, 2.0, 3.0};
+		List<Integer> breaks = Arrays.asList(0, 3); // one segment [0,3)
+
+		double total = computeTotalSSE(column, breaks);
+		double expected = computeSegmentCost(column, 0, 3);
+
+		// the result must equal the SSE of the whole segment exactly
+		assertEquals(expected, total, 1e-10);
+	}
+
+	@Test
+	public void testComputeTotalSSE_twoSegments() {
+		// example: [0,0,0] and [1,1,1] (each constant)
+		double[] column = {0.0, 0.0, 0.0, 1.0, 1.0, 1.0};
+		List<Integer> breaks = Arrays.asList(0, 3, 6); // two segments
+
+		double total = computeTotalSSE(column, breaks);
+		double sse1 = computeSegmentCost(column, 0, 3); // [0,0,0] → SSE = 0
+		double sse2 = computeSegmentCost(column, 3, 6); // [1,1,1] → SSE = 0
+
+		// both segments are constant, so totalSSE must be 0
+		assertEquals(0.0, total, 1e-10);
+		assertEquals(sse1 + sse2, total, 1e-10);
+	}
+
+	@Test
+	public void testComputeTotalSSE_threeSegments() {
+		// one segment with three identical values, two segments with two values each
+		double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0};
+		List<Integer> breaks = Arrays.asList(0, 3, 5, 7);
+
+		// segment [0,3): constant 1.0 → SSE = 0
+		double sse1 = computeSegmentCost(column, 0, 3); // 0
+
+		// segment [3,5): [2,2] → SSE = 0
+		double sse2 = computeSegmentCost(column, 3, 5); // 0
+
+		// segment [5,7): [3,3] → SSE = 0
+		double sse3 = computeSegmentCost(column, 5, 7); // 0
+
+		double total = computeTotalSSE(column, breaks);
+		assertEquals(0.0, total, 1e-10);
+		assertEquals(sse1 + sse2 + sse3, total, 1e-10);
+	}
+
+	@Test
+	public void testComputeTotalSSE_gapStartEnd() {
+		double[] column = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0};
+		List<Integer> breaks = Arrays.asList(2, 5, 8);
+
+		double total = computeTotalSSE(column, breaks);
+		double sse1 = computeSegmentCost(column, 2, 5);
+		double sse2 = computeSegmentCost(column, 5, 8);
+
+		assertEquals(sse1 + sse2, total, 1e-10);
+	}
+
+	@Test
+	public void testComputeTotalSSE_oneSegment_identical() {
+		double[] column = {1.0, 2.0, 3.0, 4.0, 5.0};
+		double sseTotal = computeSegmentCost(column, 0, 5);
+
+		List<Integer> breaks = Arrays.asList(0, 5);
+		double total = computeTotalSSE(column, breaks);
+
+		assertEquals(sseTotal, total, 1e-10);
+	}
+
+	@Test
+	public void testComputeTotalSSE_nonConstant() {
+		double[] column = {0.0, 1.0, 2.0, 3.0, 4.0};
+		List<Integer> breaks = Arrays.asList(0, 2, 5);
+
+		double total = computeTotalSSE(column, breaks);
+		double sse1 = computeSegmentCost(column, 0, 2);
+		double sse2 = computeSegmentCost(column, 2, 5);
+
+		assertTrue(total >= 0.0);
+		assertEquals(sse1 + sse2, total, 1e-10);
+	}
+
+	@Test
+	public void testComputeTotalSSE_edgeCases() {
+		double[] columnEmpty = {};
+		List<Integer> breaksEmpty = Arrays.asList(0, 0);
+		assertEquals(0.0, computeTotalSSE(columnEmpty, breaksEmpty), 1e-10);
+
+		double[] columnOne = {42.0};
+		List<Integer> breaksOne = Arrays.asList(0, 1);
+		double total = computeTotalSSE(columnOne, breaksOne);
+		assertEquals(0.0, total, 1e-10);
+	}
+
+	@Test
+	public void testRegressSegment_empty() {
+		double[] column = {1.0, 2.0, 3.0};
+		double[] result = regressSegment(column, 0, 0);
+		assertEquals(0.0, result[0], 1e-10);
+		assertEquals(0.0, result[1], 1e-10);
+	}
+
+	@Test
+	public void testRegressSegment_singlePoint() {
+		double[] column = {1.0, 2.0, 3.0};
+		double[] result = regressSegment(column, 1, 2);
+
+		assertEquals(0.0, result[0], 1e-10);
+		assertEquals(2.0, result[1], 1e-10);
+	}
+
+	@Test
+	public void testRegressSegment_twoIdentical() {
+		double[] column = {5.0, 5.0, 1.0, 1.0};
+		double[] result = regressSegment(column, 0, 2);
+
+		assertEquals(0.0, result[0], 1e-10);
+		assertEquals(5.0, result[1], 1e-10);
+	}
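As a sanity check for the regression tests that follow, the offset case can be verified against the closed form: the segment [2,4) of {1.0, 3.0, 5.0, 7.0} consists of the points (2,5) and (3,7), so n = 2, Σx = 5, Σy = 12, Σx² = 13 and Σxy = 31. That gives denom = 2·13 − 5² = 1, slope = (2·31 − 5·12) / 1 = 2 and intercept = (12 − 2·5) / 2 = 1, matching the expected {2.0, 1.0}.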
+
+	@Test
+	public void testRegressSegment_twoPoints() {
+		double[] column = {0.0, 2.0};
+		double[] result = regressSegment(column, 0, 2);
+
+		assertEquals(2.0, result[0], 1e-10);
+		assertEquals(0.0, result[1], 1e-10);
+	}
+
+	@Test
+	public void testRegressSegment_twoPoints_offset() {
+		double[] column = {1.0, 3.0, 5.0, 7.0};
+		double[] result = regressSegment(column, 2, 4);
+
+		assertEquals(2.0, result[0], 1e-10);
+		assertEquals(1.0, result[1], 1e-10);
+	}
+
+	@Test
+	public void testRegressSegment_constant() {
+		double[] column = {3.0, 3.0, 3.0, 3.0};
+		double[] result = regressSegment(column, 0, 4);
+
+		assertEquals(0.0, result[0], 1e-10);
+		assertEquals(3.0, result[1], 1e-10);
+	}
+
+	@Test
+	public void testRegressSegment_linear() {
+		double[] column = new double[4];
+		double a = 1.5, b = 2.0;
+		for(int i = 0; i < 4; i++) {
+			column[i] = a * i + b;
+		}
+
+		double[] result = regressSegment(column, 0, 4);
+
+		assertEquals(a, result[0], 1e-10);
+		assertEquals(b, result[1], 1e-10);
+	}
+
+	@Test
+	public void testRegressSegment_denomZero() {
+		double[] column = {10.0};
+		double[] result = regressSegment(column, 0, 1);
+
+		assertEquals(0.0, result[0], 1e-10);
+		assertEquals(10.0, result[1], 1e-10);
+	}
+
+	@Test
+	public void testCompressPiecewiseLinearFunctional_const() {
+		// 1. create a MatrixBlock with one constant column
+		int nrows = 20, ncols = 1;
+		MatrixBlock in = new MatrixBlock(nrows, ncols, false);
+		for(int r = 0; r < nrows; r++)
+			in.set(r, 0, 1.0);
+		// 2. colIndexes for column 0
+		IColIndex colIndexes = ColIndexFactory.create(new int[] {0});
+		// 3. CompressionSettings with a target loss
+		CompressionSettings cs = new CompressionSettingsBuilder().create();
+		cs.setPiecewiseTargetLoss(1e-6);
+		// 4. call the compression function
+		AColGroup result = ColGroupFactory.compressPiecewiseLinearFunctional(colIndexes, in, cs);
+
+		// 5. is the result a ColGroupPiecewiseLinearCompressed?
+		assertTrue(result instanceof ColGroupPiecewiseLinearCompressed);
+		ColGroupPiecewiseLinearCompressed plGroup = (ColGroupPiecewiseLinearCompressed) result;
+
+		// 6. breakpoints via getter, not via create()
+		int[] breakpoints = plGroup.getBreakpoints();
+		assertArrayEquals(new int[] {0, 20}, breakpoints);
+
+		// 7. one segment → one slope, one intercept
+		double[] slopes = plGroup.getSlopes();
+		double[] intercepts = plGroup.getIntercepts();
+		assertEquals(1, slopes.length);
+		assertEquals(1, intercepts.length);
+
+		// 8. for constant data: slope ~0, intercept ~1.0
+		assertEquals(0.0, slopes[0], 1e-10);
+		assertEquals(1.0, intercepts[0], 1e-10);
+
+		// 9. check: the column indexes match
+		IColIndex idx = plGroup.getColIndices();
+		assertEquals(1, idx.size());
+		assertEquals(0, idx.get(0));
+	}
+
+	@Test(expected = IllegalArgumentException.class)
+	public void testCreate_nullBreakpoints() {
+		int[] nullBp = null;
+		ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), nullBp, new double[] {1.0},
+			new double[] {0.0}, 10);
+	}
+
+	@Test(expected = IllegalArgumentException.class)
+	public void testCreate_tooFewBreakpoints() {
+		int[] singleBp = {0};
+		ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), singleBp, new double[] {1.0},
+			new double[] {0.0}, 10);
+	}
+
+	@Test(expected = IllegalArgumentException.class)
+	public void testCreate_inconsistentSlopes() {
+		int[] bp = {0, 5, 10};
+		ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), bp,
+			new double[] {1.0, 2.0, 3.0}, new double[] {0.0, 1.0}, 10);
+	}
+
+	@Test(expected = IllegalArgumentException.class)
+	public void testCreate_inconsistentIntercepts() {
+		int[] bp = {0, 5, 10};
+		ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), bp, new double[] {1.0, 2.0},
+			new double[] {0.0}, 10);
+	}
+
+	@Test
+	public void testCreate_validMultiSegment() {
+		int[] bp = {0, 3, 7, 10};
+		double[] slopes = {1.0, -2.0, 0.5};
+		double[] intercepts = {0.0, 5.0, -1.0};
+		IColIndex cols = ColIndexFactory.create(new int[] {0, 1});
+
+		AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, 10);
+
+		assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed);
+		assertNotSame(bp, ((ColGroupPiecewiseLinearCompressed) cg).getBreakpoints());
+	}
+
+	@Test
+	public void testCreate_multiColumn() {
+		IColIndex cols = ColIndexFactory.create(new int[] {5, 10, 15});
+		int[] bp = {0, 5};
+		double[] slopes = {3.0};
+		double[] intercepts = {2.0};
+
+		AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, 100);
+		assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed);
+
+		assertTrue(cg.getNumValues() > 0);
+
+		for(int r = 0; r < 5; r++) {
+			double expected = 3.0 * r + 2.0;
+			// colIdx=0 → global column 5
+			assertEquals(expected, cg.getIdx(r, 0), 1e-9);
+			// colIdx=1 → global column 10
+			assertEquals(expected, cg.getIdx(r, 1), 1e-9);
+			// colIdx=2 → global column 15
+			assertEquals(expected, cg.getIdx(r, 2), 1e-9);
+		}
+
+		for(int r = 5; r < 10; r++) {
+			double expected = 3.0 * r + 2.0;
+			assertEquals(expected, cg.getIdx(r, 0), 1e-9); // all columns share the same model
+		}
+		assertEquals(3, cols.size());
+	}
+
+	@Test
+	public void testCreate_singleColumn() {
+		IColIndex cols = ColIndexFactory.create(new int[] {5});
+		int[] bp = {0, 5};
+		double[] slopes = {3.0};
+		double[] intercepts = {2.0};
+		int numRows = 10;
+
+		AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, numRows);
+
+		assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed);
+
+		assertEquals(2.0, cg.getIdx(0, 0), 1e-9); // 3*0 + 2
+		assertEquals(5.0, cg.getIdx(1, 0), 1e-9); // 3*1 + 2
+	}
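The expected values in the decompress tests further below all derive from createTestGroup(12), which encodes segment [0,5) as y = r and segment [5,12) as y = 3r + 2; rows 5 through 11 therefore decompress to 17, 20, 23, 26, 29, 32 and 35, which is exactly what the full-range and partial-range assertions check.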
+
+	@Test
+	public void testCreate_validMinimal() {
+		// 1 segment: [0,10] → y = 2.0 * r + 1.0
+		int[] bp = {0, 10};
+		double[] slopes = {2.0};
+		double[] intercepts = {1.0};
+		IColIndex cols = ColIndexFactory.create(new int[] {0});
+		int numRows = 10;
+
+		AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, numRows);
+
+		// correct instance
+		assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed);
+
+		// getNumValues() > 0
+		assertTrue(cg.getNumValues() > 0);
+
+		// r < numRows
+		for(int r = 0; r < numRows; r++) {
+			double expected = 2.0 * r + 1.0;
+			assertEquals("Row " + r, expected, cg.getIdx(r, 0), 1e-9);
+		}
+
+		// last valid row
+		assertEquals(19.0, cg.getIdx(9, 0), 1e-9);
+
+		// out of bounds correctly yields 0.0
+		assertEquals(0.0, cg.getIdx(10, 0), 1e-9);
+		assertEquals(0.0, cg.getIdx(9, 1), 1e-9);
+	}
+
+	@Test
+	public void testDecompressToDenseBlock() {
+		int[] bp = {0, 5, 10};
+		double[] slopes = {1.0, 2.0};
+		double[] intercepts = {0.0, 1.0};
+		int numRows = 10;
+
+		AColGroup cg = ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), bp, slopes,
+			intercepts, numRows);
+
+		// 1. MatrixBlock with the correct dimensions
+		MatrixBlock target = new MatrixBlock(numRows, 1, false);
+
+		// 2. allocate the DenseBlock FIRST!
+		target.allocateDenseBlock(); // or target.allocateDenseBlock(true);
+
+		// 3. the DenseBlock is now available
+		DenseBlock db = target.getDenseBlock();
+		assertNotNull(db); // make sure!
+
+		// 4. decompress
+		cg.decompressToDenseBlock(db, 0, numRows, 0, 0);
+
+		// 5. verify
+		for(int r = 0; r < numRows; r++) {
+			double expected = (r < 5) ? 1.0 * r : 2.0 * r + 1.0;
+			assertEquals("Row " + r, expected, db.get(r, 0), 1e-9);
+		}
+	}
+
+	private ColGroupPiecewiseLinearCompressed createTestGroup(int numRows) {
+		int[] bp = {0, 5, numRows};
+		double[] slopes = {1.0, 3.0};
+		double[] intercepts = {0.0, 2.0};
+		return (ColGroupPiecewiseLinearCompressed) ColGroupPiecewiseLinearCompressed.create(
+			ColIndexFactory.create(new int[] {0}), bp, slopes, intercepts, numRows);
+	}
+
+	@Test
+	public void testDecompressToDenseBlock_fullRange() {
+		ColGroupPiecewiseLinearCompressed cg = createTestGroup(12);
+
+		MatrixBlock target = new MatrixBlock(12, 1, false);
+		target.allocateDenseBlock();
+		DenseBlock db = target.getDenseBlock();
+
+		cg.decompressToDenseBlock(db, 0, 12, 0, 0);
+
+		// Segment 0 [0,5): y = r
+		assertEquals(0.0, db.get(0, 0), 1e-9);
+		assertEquals(4.0, db.get(4, 0), 1e-9);
-	@Test
-	public void testComputeBreakpoints_uniformColumn() {
-		CompressionSettings cs = new CompressionSettingsBuilder().create();
-		cs.setPiecewiseTargetLoss(1e-3);
-		double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; // ← Test-spezifisch
-		List breaks = computeBreakpoints(cs, column);
-		assertEquals(Arrays.asList(0, 5), breaks); // Erwartet: keine Breaks
-	}
-
-	@Test
-	public void testComputeBreakpoints_linearIncreasing() {
-		CompressionSettings cs = new CompressionSettingsBuilder().create();
-		cs.setPiecewiseTargetLoss(1e-3);
-		double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; // ← andere column
-		List breaks = computeBreakpoints(cs, column);
-		assertEquals(Arrays.asList(0, 5), breaks); // Erwartet
-
-	}
-
-	@Test
-	public void testComputeBreakpoints_highLoss_uniform() {
-		CompressionSettings cs = new CompressionSettingsBuilder().create();
-		cs.setPiecewiseTargetLoss(10000.0);
-		double[] column = {1.0, 1.0, 1.0, 1.0, 1.0};
-		List breaks = computeBreakpoints(cs, column);
-		assertEquals(Arrays.asList(0, 5), breaks);
-	}
-
-	@Test
-	public void testComputeBreakpoints_twoSegments() {
-		CompressionSettings cs = new CompressionSettingsBuilder().create();
-		cs.setPiecewiseTargetLoss(1e-3);
-		// {1,1,1, 2,2,2} → 2 Segmente → [0,3,6]
-		double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0};
-		var breaks = computeBreakpoints(cs, column);
-		assertEquals(Arrays.asList(0, 3, 6), breaks);
-	}
-
-	@Test
-	public void testComputeBreakpoints_noLoss_linear() 
{ - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(0.0); - //cs.setPiecewiseTargetLoss(0.0); - double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; - List breaks = computeBreakpoints(cs, column); - assertEquals(Arrays.asList(0, 5), breaks); // bei 0 Loss alle Breaks - } - - @Test - public void testComputeBreakpointsLambda_const() { - double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; - List breaks = computeBreakpointsLambda(column, 5.0); - assertEquals(Arrays.asList(0, 5), breaks); - - breaks = computeBreakpointsLambda(column, 0.01); - assertEquals(Arrays.asList(0, 5), breaks); - } - - @Test - public void testComputeBreakpointsLambda_twoSegments() { - double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0}; // 6 Werte - - // mit kleinem lambda -> viele Segmente (kostenlos fast) - List breaks = computeBreakpointsLambda(column, 0.01); - assertTrue(breaks.contains(3)); - assertEquals(3, breaks.size()); - assertEquals(Arrays.asList(0, 3, 6), breaks); - - // mit großem lambda entspricht nur ein Segment - breaks = computeBreakpointsLambda(column, 1000.0); - assertEquals(Arrays.asList(0, 6), breaks); - } - - @Test - public void testComputeBreakpointsLambda_jumpWithTrend() { - double[] column = {0.0, 1.0, 2.0, 10.0, 11.0, 12.0}; - - // grobe Segmentanpassung: ein Segment pro „Abschnitt“ - List breaks = computeBreakpointsLambda(column, 0.5); - assertEquals(Arrays.asList(0, 3, 6), breaks); - - // nur ein Segment, wenn lambda sehr groß - breaks = computeBreakpointsLambda(column, 100.0); - assertEquals(Arrays.asList(0, 6), breaks); - } - - @Test - public void testComputeBreakpointsLambda_linear() { - double[] column = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0}; - - List breaks = computeBreakpointsLambda(column, 1.0); - assertEquals(Arrays.asList(0, 6), breaks); - - // mit sehr kleinem lambda: wir prüfen nur, dass die Grenzen vernünftig sind - breaks = computeBreakpointsLambda(column, 0.001); - assertTrue(breaks.size() >= 2); - assertTrue(breaks.get(0) == 0); - assertTrue(breaks.get(breaks.size() - 1) == column.length); - } - - @Test - public void testComputeBreakpointsLambda_edge_lambdaVerySmall() { - double[] column = {1.0, 1.1, 1.0, 1.1, 1.0}; - - List breaks = computeBreakpointsLambda(column, 0.001); - assertNotNull(breaks); - assertFalse(breaks.isEmpty()); - assertEquals(0, (int) breaks.get(0)); - assertEquals(column.length, (int) breaks.get(breaks.size() - 1)); - - // Prüfe, dass die Liste sortiert ist - for (int i = 1; i < breaks.size(); i++) { - assertTrue(breaks.get(i) >= breaks.get(i - 1)); - } - } - - @Test - public void testComputeBreakpointsLambda_edge_lambdaVeryLarge() { - double[] column = {1.0, 2.0, 1.5, 2.5, 1.8}; - - List breaks = computeBreakpointsLambda(column, 1000.0); - assertEquals(Arrays.asList(0, 5), breaks); - } - - @Test - public void testComputeSegmentCost_emptyOrSingle() { - double[] column = {10.0, 20.0, 30.0}; - - // 0 Elemente (leer) - assertEquals(0.0, computeSegmentCost(column, 0, 0), 1e-10); - assertEquals(0.0, computeSegmentCost(column, 1, 1), 1e-10); - - // 1 Element → Regressionsgerade ist nicht eindeutig definiert, aber SSE=0 - assertEquals(0.0, computeSegmentCost(column, 0, 1), 1e-10); - assertEquals(0.0, computeSegmentCost(column, 1, 2), 1e-10); - assertEquals(0.0, computeSegmentCost(column, 2, 3), 1e-10); - } - - @Test - public void testComputeSegmentCost_twoConstantPoints() { - double[] column = {5.0, 5.0, 1.0, 1.0}; - - // Zwei identische Punkte (konstant) → SSE = 0 - double sse = computeSegmentCost(column, 0, 2); - assertEquals(0.0, sse, 
1e-10); - } - - @Test - public void testComputeSegmentCost_twoDifferentPoints() { - double[] column = {0.0, 2.0, 1.0, 3.0}; - - // Zwei Punkte: (0,0) und (1,2) → Gerade y = 2*x, Fehler = 0 - double sse = computeSegmentCost(column, 0, 2); - assertEquals(0.0, sse, 1e-10); - - // Zwei Punkte: (2,1) und (3,3) → Gerade y = 2*x - 3, Fehler = 0 - sse = computeSegmentCost(column, 2, 4); - assertEquals(0.0, sse, 1e-10); - } - - @Test - public void testComputeSegmentCost_constantThree() { - double[] column = {0.0, 0.0, 0.0}; - double sse = computeSegmentCost(column, 0, 3); - assertEquals(0.0, sse, 1e-10); - } - - @Test - public void testComputeSegmentCost_consistent_with_regression() { - double[] column = {0.0, 2.0, 0.0, 4.0, 0.0, 6.0}; - - int start = 0, end = 3; - double[] ab = regressSegment(column, start, end); - double slope = ab[0], intercept = ab[1]; - double sse_hand = 0.0; - for (int i = start; i < end; i++) { - double yhat = slope * i + intercept; - double diff = column[i] - yhat; - sse_hand += diff * diff; - } - - double sse = computeSegmentCost(column, start, end); - assertEquals(sse_hand, sse, 1e-10); - } - - @Test - public void testComputeTotalSSE_emptyBreaks() { - double[] column = {1.0, 2.0, 3.0}; - List breaks = Arrays.asList(); // leer → keine Segmente - double total = computeTotalSSE(column, breaks); - - // 0 Segmente → Summe über 0 Segmente = 0 - assertEquals(0.0, total, 1e-10); - } - - @Test - public void testComputeTotalSSE_singleSegment_all() { - double[] column = {1.0, 2.0, 3.0}; - List breaks = Arrays.asList(0, 3); // ein Segment [0,3) - - double total = computeTotalSSE(column, breaks); - double expected = computeSegmentCost(column, 0, 3); - - // Ergebnis muss exakt das gleiche wie der SSE des gesamten Segments sein - assertEquals(expected, total, 1e-10); - } - - @Test - public void testComputeTotalSSE_twoSegments() { - // Beispiel: [0,0,0] und [1,1,1] (jeweils konstant) - double[] column = {0.0, 0.0, 0.0, 1.0, 1.0, 1.0}; - List breaks = Arrays.asList(0, 3, 6); // zwei Segmente - - double total = computeTotalSSE(column, breaks); - double sse1 = computeSegmentCost(column, 0, 3); // [0,0,0] → SSE = 0 - double sse2 = computeSegmentCost(column, 3, 6); // [1,1,1] → SSE = 0 - - // da beide Segmente konstant sind, muss totalSSE = 0 sein - assertEquals(0.0, total, 1e-10); - assertEquals(sse1 + sse2, total, 1e-10); - } - - @Test - public void testComputeTotalSSE_threeSegments() { - // Ein Segment mit drei identischen Werten, zwei Segmente mit jeweils zwei Werten - double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0}; - List breaks = Arrays.asList(0, 3, 5, 7); - - // Segment [0,3): konstant 1.0 → SSE = 0 - double sse1 = computeSegmentCost(column, 0, 3); // 0 - - // Segment [3,5): [2,2] → SSE = 0 - double sse2 = computeSegmentCost(column, 3, 5); // 0 - - // Segment [5,7): [3,3] → SSE = 0 - double sse3 = computeSegmentCost(column, 5, 7); // 0 - - double total = computeTotalSSE(column, breaks); - assertEquals(0.0, total, 1e-10); - assertEquals(sse1 + sse2 + sse3, total, 1e-10); - } - - @Test - public void testComputeTotalSSE_gapStartEnd() { - double[] column = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0}; - List breaks = Arrays.asList(2, 5, 8); - - double total = computeTotalSSE(column, breaks); - double sse1 = computeSegmentCost(column, 2, 5); - double sse2 = computeSegmentCost(column, 5, 8); - - assertEquals(sse1 + sse2, total, 1e-10); - - } - - @Test - public void testComputeTotalSSE_oneSegment_identical() { - double[] column = {1.0, 2.0, 3.0, 4.0, 5.0}; - double sseTotal = 
computeSegmentCost(column, 0, 5); - - List breaks = Arrays.asList(0, 5); - double total = computeTotalSSE(column, breaks); - - assertEquals(sseTotal, total, 1e-10); - } - - @Test - public void testComputeTotalSSE_nonConstant() { - double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; - List breaks = Arrays.asList(0, 2, 5); - - double total = computeTotalSSE(column, breaks); - double sse1 = computeSegmentCost(column, 0, 2); - double sse2 = computeSegmentCost(column, 2, 5); - - assertTrue(total >= 0.0); - assertEquals(sse1 + sse2, total, 1e-10); - } - - @Test - public void testComputeTotalSSE_edgeCases() { - double[] columnEmpty = {}; - List breaksEmpty = Arrays.asList(0, 0); - assertEquals(0.0, computeTotalSSE(columnEmpty, breaksEmpty), 1e-10); - - double[] columnOne = {42.0}; - List breaksOne = Arrays.asList(0, 1); - double total = computeTotalSSE(columnOne, breaksOne); - assertEquals(0.0, total, 1e-10); - } - - @Test - public void testRegressSegment_empty() { - double[] column = {1.0, 2.0, 3.0}; - double[] result = regressSegment(column, 0, 0); - assertEquals(0.0, result[0], 1e-10); - assertEquals(0.0, result[1], 1e-10); - } - - @Test - public void testRegressSegment_singlePoint() { - double[] column = {1.0, 2.0, 3.0}; - double[] result = regressSegment(column, 1, 2); - - assertEquals(0.0, result[0], 1e-10); - assertEquals(2.0, result[1], 1e-10); - } - - @Test - public void testRegressSegment_twoIdentical() { - double[] column = {5.0, 5.0, 1.0, 1.0}; - double[] result = regressSegment(column, 0, 2); - - assertEquals(0.0, result[0], 1e-10); - assertEquals(5.0, result[1], 1e-10); - } - - @Test - public void testRegressSegment_twoPoints() { - double[] column = {0.0, 2.0}; - double[] result = regressSegment(column, 0, 2); - - assertEquals(2.0, result[0], 1e-10); - assertEquals(0.0, result[1], 1e-10); - } - - @Test - public void testRegressSegment_twoPoints_offset() { - - double[] column = {1.0, 3.0, 5.0, 7.0}; - double[] result = regressSegment(column, 2, 4); - - assertEquals(2.0, result[0], 1e-10); - assertEquals(1.0, result[1], 1e-10); - } - - @Test - public void testRegressSegment_constant() { - double[] column = {3.0, 3.0, 3.0, 3.0}; - double[] result = regressSegment(column, 0, 4); - - assertEquals(0.0, result[0], 1e-10); - assertEquals(3.0, result[1], 1e-10); - } - - @Test - public void testRegressSegment_linear() { - double[] column = new double[4]; - double a = 1.5, b = 2.0; - for (int i = 0; i < 4; i++) { - column[i] = a * i + b; - } - - double[] result = regressSegment(column, 0, 4); - - assertEquals(a, result[0], 1e-10); - assertEquals(b, result[1], 1e-10); - } - - @Test - public void testRegressSegment_denomZero() { - double[] column = {10.0}; - double[] result = regressSegment(column, 0, 1); - - assertEquals(0.0, result[0], 1e-10); - assertEquals(10.0, result[1], 1e-10); - } - - @Test - public void testCompressPiecewiseLinearFunctional_const() { - // 1. MatrixBlock mit einer konstanten Spalte erzeugen - int nrows = 20, ncols = 1; - MatrixBlock in = new MatrixBlock(nrows, ncols, false); - for (int r = 0; r < nrows; r++) - in.set(r, 0, 1.0); - // 2. colIndexes für Spalte 0 - IColIndex colIndexes = ColIndexFactory.create(new int[]{0}); - // 3. CompressionSettings mit TargetLoss - CompressionSettings cs = new CompressionSettingsBuilder().create(); - cs.setPiecewiseTargetLoss(1e-6); - // 4. Aufruf der Kompressionsfunktion - AColGroup result = ColGroupFactory.compressPiecewiseLinearFunctional(colIndexes, in, cs); - - // 5. Ergebnis ist eine ColGroupPiecewiseLinearCompressed? 
- assertTrue(result instanceof ColGroupPiecewiseLinearCompressed); - ColGroupPiecewiseLinearCompressed plGroup = (ColGroupPiecewiseLinearCompressed) result; - - // 6. Breakpoints per Getter, nicht per create() - int[] breakpoints = plGroup.getBreakpoints(); - assertArrayEquals(new int[]{0, 20}, breakpoints); - - // 7. Pro Segment: 1 Segment → ein slope, ein intercept - double[] slopes = plGroup.getSlopes(); - double[] intercepts = plGroup.getIntercepts(); - assertEquals(1, slopes.length); - assertEquals(1, intercepts.length); - - // 8. Für konstante Daten: Steigung ~0, intercept ~1.0 - assertEquals(0.0, slopes[0], 1e-10); - assertEquals(1.0, intercepts[0], 1e-10); - - // 9. Check: colIndexes stimmt - IColIndex idx = plGroup.getColIndices(); - assertEquals(1, idx.size()); - assertEquals(0, idx.get(0)); - } - - @Test(expected = IllegalArgumentException.class) - public void testCreate_nullBreakpoints() { - int[] nullBp = null; - ColGroupPiecewiseLinearCompressed.create( - ColIndexFactory.create(new int[]{0}), nullBp, new double[]{1.0}, new double[]{0.0}, 10); - } - - @Test(expected = IllegalArgumentException.class) - public void testCreate_tooFewBreakpoints() { - int[] singleBp = {0}; - ColGroupPiecewiseLinearCompressed.create( - ColIndexFactory.create(new int[]{0}), singleBp, new double[]{1.0}, new double[]{0.0}, 10); - } - - @Test(expected = IllegalArgumentException.class) - public void testCreate_inconsistentSlopes() { - int[] bp = {0, 5, 10}; - ColGroupPiecewiseLinearCompressed.create( - ColIndexFactory.create(new int[]{0}), bp, new double[]{1.0, 2.0, 3.0}, - new double[]{0.0, 1.0}, 10); - } - - @Test(expected = IllegalArgumentException.class) - public void testCreate_inconsistentIntercepts() { - int[] bp = {0, 5, 10}; - ColGroupPiecewiseLinearCompressed.create( - ColIndexFactory.create(new int[]{0}), bp, new double[]{1.0, 2.0}, - new double[]{0.0}, 10); - } - - @Test - public void testCreate_validMultiSegment() { - int[] bp = {0, 3, 7, 10}; - double[] slopes = {1.0, -2.0, 0.5}; - double[] intercepts = {0.0, 5.0, -1.0}; - IColIndex cols = ColIndexFactory.create(new int[]{0, 1}); - - AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, 10); - - assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); - assertNotSame(bp, ((ColGroupPiecewiseLinearCompressed) cg).getBreakpoints()); - } - - @Test - public void testCreate_multiColumn() { - IColIndex cols = ColIndexFactory.create(new int[]{5, 10, 15}); - int[] bp = {0, 5}; - double[] slopes = {3.0}; - double[] intercepts = {2.0}; - - AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, 100); - assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); - assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed); - - // - assertTrue(cg.getNumValues() > 0); - - for (int r = 0; r < 5; r++) { - double expected = 3.0 * r + 2.0; - // colIdx=0 → globale Spalte 5 - assertEquals(expected, cg.getIdx(r, 0), 1e-9); - // colIdx=1 → globale Spalte 10 - assertEquals(expected, cg.getIdx(r, 1), 1e-9); - // colIdx=2 → globale Spalte 15 - assertEquals(expected, cg.getIdx(r, 2), 1e-9); - } - - for (int r = 5; r < 10; r++) { - double expected = 3.0 * r + 2.0; - assertEquals(expected, cg.getIdx(r, 0), 1e-9); // Alle Columns gleich - } - assertEquals(cols.size(), 3); - } - - @Test - public void testCreate_singleColumn() { - IColIndex cols = ColIndexFactory.create(new int[]{5}); - int[] bp = {0, 5}; - double[] slopes = {3.0}; - double[] intercepts = {2.0}; - int numRows = 10; - - AColGroup cg = 
-
-	@Test
-	public void testCreate_validMultiSegment() {
-		int[] bp = {0, 3, 7, 10};
-		double[] slopes = {1.0, -2.0, 0.5};
-		double[] intercepts = {0.0, 5.0, -1.0};
-		IColIndex cols = ColIndexFactory.create(new int[]{0, 1});
-
-		AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, 10);
-
-		assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed);
-		assertNotSame(bp, ((ColGroupPiecewiseLinearCompressed) cg).getBreakpoints());
-	}
-
-	@Test
-	public void testCreate_multiColumn() {
-		IColIndex cols = ColIndexFactory.create(new int[]{5, 10, 15});
-		int[] bp = {0, 5};
-		double[] slopes = {3.0};
-		double[] intercepts = {2.0};
-
-		AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, 100);
-		assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed);
-
-		assertTrue(cg.getNumValues() > 0);
-
-		for (int r = 0; r < 5; r++) {
-			double expected = 3.0 * r + 2.0;
-			// colIdx=0 → global column 5
-			assertEquals(expected, cg.getIdx(r, 0), 1e-9);
-			// colIdx=1 → global column 10
-			assertEquals(expected, cg.getIdx(r, 1), 1e-9);
-			// colIdx=2 → global column 15
-			assertEquals(expected, cg.getIdx(r, 2), 1e-9);
-		}
-
-		for (int r = 5; r < 10; r++) {
-			double expected = 3.0 * r + 2.0;
-			assertEquals(expected, cg.getIdx(r, 0), 1e-9); // all columns equal
-		}
-		assertEquals(3, cols.size());
-	}
-
-	@Test
-	public void testCreate_singleColumn() {
-		IColIndex cols = ColIndexFactory.create(new int[]{5});
-		int[] bp = {0, 5};
-		double[] slopes = {3.0};
-		double[] intercepts = {2.0};
-		int numRows = 10;
-
-		AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, numRows);
-
-		assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed);
-
-		assertEquals(2.0, cg.getIdx(0, 0), 1e-9); // 3*0 + 2
-		assertEquals(5.0, cg.getIdx(1, 0), 1e-9); // 3*1 + 2
-	}
-
-	@Test
-	public void testCreate_validMinimal() {
-
-		// 1 segment: [0,10] → y = 2.0 * r + 1.0
-		int[] bp = {0, 10};
-		double[] slopes = {2.0};
-		double[] intercepts = {1.0};
-		IColIndex cols = ColIndexFactory.create(new int[]{0});
-		int numRows = 10;
-
-		AColGroup cg = ColGroupPiecewiseLinearCompressed.create(cols, bp, slopes, intercepts, numRows);
-
-		// correct instance
-		assertTrue(cg instanceof ColGroupPiecewiseLinearCompressed);
-
-		// getNumValues() > 0
-		assertTrue(cg.getNumValues() > 0);
-
-		// r < numRows
-		for (int r = 0; r < numRows; r++) {
-			double expected = 2.0 * r + 1.0;
-			assertEquals("Row " + r, expected, cg.getIdx(r, 0), 1e-9);
-		}
-
-		// last valid row
-		assertEquals(19.0, cg.getIdx(9, 0), 1e-9);
-
-		// out of bounds correctly yields 0.0
-		assertEquals(0.0, cg.getIdx(10, 0), 1e-9);
-		assertEquals(0.0, cg.getIdx(9, 1), 1e-9);
-	}
-
-	@Test
-	public void testDecompressToDenseBlock() {
-		int[] bp = {0, 5, 10};
-		double[] slopes = {1.0, 2.0};
-		double[] intercepts = {0.0, 1.0};
-		int numRows = 10;
-
-		AColGroup cg = ColGroupPiecewiseLinearCompressed.create(
-			ColIndexFactory.create(new int[]{0}), bp, slopes, intercepts, numRows);
-
-		// 1. MatrixBlock with the correct dimensions
-		MatrixBlock target = new MatrixBlock(numRows, 1, false);
-
-		// 2. Allocate the DenseBlock FIRST!
-		target.allocateDenseBlock(); // or target.allocateDenseBlock(true);
-
-		// 3. Now the DenseBlock is available
-		DenseBlock db = target.getDenseBlock();
-		assertNotNull(db); // make sure!
-
-		// 4. Decompress
-		cg.decompressToDenseBlock(db, 0, numRows, 0, 0);
-
-		// 5. Verify
-		for (int r = 0; r < numRows; r++) {
-			double expected = (r < 5) ? 1.0 * r : 2.0 * r + 1.0;
-			assertEquals("Row " + r, expected, db.get(r, 0), 1e-9);
-		}
-	}
-
-	private ColGroupPiecewiseLinearCompressed createTestGroup(int numRows) {
-		int[] bp = {0, 5, numRows};
-		double[] slopes = {1.0, 3.0};
-		double[] intercepts = {0.0, 2.0};
-		return (ColGroupPiecewiseLinearCompressed) ColGroupPiecewiseLinearCompressed.create(
-			ColIndexFactory.create(new int[]{0}), bp, slopes, intercepts, numRows);
-	}
-
-	@Test
-	public void testDecompressToDenseBlock_fullRange() {
-		ColGroupPiecewiseLinearCompressed cg = createTestGroup(12);
-
-		MatrixBlock target = new MatrixBlock(12, 1, false);
-		target.allocateDenseBlock();
-		DenseBlock db = target.getDenseBlock();
-
-		cg.decompressToDenseBlock(db, 0, 12, 0, 0);
-
-		// Segment 0 [0,5): y = r
-		assertEquals(0.0, db.get(0, 0), 1e-9);
-		assertEquals(4.0, db.get(4, 0), 1e-9);
-
-		assertEquals(17.0, db.get(5, 0), 1e-9);
-		assertEquals(29.0, db.get(9, 0), 1e-9);
-		assertEquals(32.0, db.get(10, 0), 1e-9);
-		assertEquals(35.0, db.get(11, 0), 1e-9);
-	}
-
-
-
-	@Test
-	public void testDecompressToDenseBlock_partialRange() {
-		ColGroupPiecewiseLinearCompressed cg = createTestGroup(12);
-
-		MatrixBlock target = new MatrixBlock(12, 1, false);
-		target.allocateDenseBlock();
-		DenseBlock db = target.getDenseBlock();
-
-		// rl=6, ru=9 → decompress r=6,7,8
-		// offR=0 → writes to target rows 6,7,8
-		cg.decompressToDenseBlock(db, 6, 9, 0, 0);
-
-
-		assertEquals(0.0, db.get(0, 0), 1e-9); // untouched (before rl=6)
-		assertEquals(20.0, db.get(6, 0), 1e-9);
-		assertEquals(23.0, db.get(7, 0), 1e-9);
-		assertEquals(26.0, db.get(8, 0), 1e-9);
-		assertEquals(0.0, db.get(9, 0), 1e-9); // untouched (after ru=9)
-	}
-
-
-	@Test
-	public void testDecompressToDenseBlock_emptyRange() {
-		ColGroupPiecewiseLinearCompressed cg = createTestGroup(12);
-
-		MatrixBlock target = new MatrixBlock(5, 1, false);
-		target.allocateDenseBlock();
-		DenseBlock db = target.getDenseBlock();
-
-		// empty range
-		cg.decompressToDenseBlock(db, 12, 12, 0, 0); // rl=ru
-		cg.decompressToDenseBlock(db, 3, 2, 0, 0); // rl>ru
-
-		// everything stays 0.0
-		for (int r = 0; r < 5; r++) {
-			assertEquals(0.0, db.get(r, 0), 1e-9);
-		}
-	}
-
-	@Test
-	public void testDecompressToDenseBlock_nullSafety() {
-		ColGroupPiecewiseLinearCompressed cg = createTestGroup(10);
+		assertEquals(17.0, db.get(5, 0), 1e-9);
+		assertEquals(29.0, db.get(9, 0), 1e-9);
+		assertEquals(32.0, db.get(10, 0), 1e-9);
+		assertEquals(35.0, db.get(11, 0), 1e-9);
+	}
 
-		// null DenseBlock
-		cg.decompressToDenseBlock(null, 0, 10, 0, 0);
+	@Test
+	public void testDecompressToDenseBlock_partialRange() {
+		ColGroupPiecewiseLinearCompressed cg = createTestGroup(12);
 
-		// invalid parameters (empty range)
-		MatrixBlock target = new MatrixBlock(10, 1, false);
-		target.allocateDenseBlock();
-		DenseBlock db = target.getDenseBlock();
+		MatrixBlock target = new MatrixBlock(12, 1, false);
+		target.allocateDenseBlock();
+		DenseBlock db = target.getDenseBlock();
+
+		// rl=6, ru=9 → decompress r=6,7,8
+		// offR=0 → writes to target rows 6,7,8
+		cg.decompressToDenseBlock(db, 6, 9, 0, 0);
+
+		assertEquals(0.0, db.get(0, 0), 1e-9); // untouched (before rl=6)
+		assertEquals(20.0, db.get(6, 0), 1e-9);
+		assertEquals(23.0, db.get(7, 0), 1e-9);
+		assertEquals(26.0, db.get(8, 0), 1e-9);
+		assertEquals(0.0, db.get(9, 0), 1e-9); // untouched (after ru=9)
+	}
 
-		cg.decompressToDenseBlock(db, 12, 12, 0, 0); // rl == ru
-		cg.decompressToDenseBlock(db, 5, 2, 0, 0); // rl > ru
+	@Test
+	public void testDecompressToDenseBlock_emptyRange() {
+		ColGroupPiecewiseLinearCompressed cg = createTestGroup(12);
+
+		MatrixBlock target = new MatrixBlock(5, 1, false);
+		target.allocateDenseBlock();
+		DenseBlock db = target.getDenseBlock();
+
+		// empty range
+		cg.decompressToDenseBlock(db, 12, 12, 0, 0); // rl=ru
+		cg.decompressToDenseBlock(db, 3, 2, 0, 0); // rl>ru
+
+		// everything stays 0.0
+		for(int r = 0; r < 5; r++) {
+			assertEquals(0.0, db.get(r, 0), 1e-9);
+		}
+	}
+
+	@Test
+	public void testDecompressToDenseBlock_nullSafety() {
+		ColGroupPiecewiseLinearCompressed cg = createTestGroup(10);
+
+		// null DenseBlock
+		cg.decompressToDenseBlock(null, 0, 10, 0, 0);
+
+		// invalid parameters (empty range)
+		MatrixBlock target = new MatrixBlock(10, 1, false);
+		target.allocateDenseBlock();
+		DenseBlock db = target.getDenseBlock();
-		// target unchanged
-		for (int r = 0; r < 10; r++) {
-			assertEquals(0.0, db.get(r, 0), 1e-9);
-		}
-	}
-	private CompressedSizeInfo createTestCompressedSizeInfo() {
-		IColIndex cols = ColIndexFactory.create(new int[]{0});
-		EstimationFactors facts = new EstimationFactors(2, 10);
+		cg.decompressToDenseBlock(db, 12, 12, 0, 0); // rl == ru
+		cg.decompressToDenseBlock(db, 5, 2, 0, 0); // rl > ru
-		CompressedSizeInfoColGroup info = new CompressedSizeInfoColGroup(
-			cols, facts, AColGroup.CompressionType.PiecewiseLinear);
+		// target unchanged
+		for(int r = 0; r < 10; r++) {
+			assertEquals(0.0, db.get(r, 0), 1e-9);
+		}
+	}
-		List<CompressedSizeInfoColGroup> infos = Arrays.asList(info);
-		CompressedSizeInfo csi = new CompressedSizeInfo(infos);
+	private CompressedSizeInfo createTestCompressedSizeInfo() {
+		IColIndex cols = ColIndexFactory.create(new int[] {0});
+		EstimationFactors facts = new EstimationFactors(2, 10);
-		return csi;
-	}
+		CompressedSizeInfoColGroup info = new CompressedSizeInfoColGroup(cols, facts,
+			AColGroup.CompressionType.PiecewiseLinear);
+
+		List<CompressedSizeInfoColGroup> infos = Arrays.asList(info);
+		CompressedSizeInfo csi = new CompressedSizeInfo(infos);
+
+		return csi;
+	}
-	@Test
-	public void testCompressPiecewiseLinear_viaRealAPI() {
+	@Test
+	public void testCompressPiecewiseLinear_viaRealAPI() {
-		MatrixBlock in = new MatrixBlock(10, 1, false);
-		in.allocateDenseBlock();
-		for (int r = 0; r < 10; r++) {
-			in.set(r, 0, r * 0.5);
-		}
+		MatrixBlock in = new MatrixBlock(10, 1, false);
+		in.allocateDenseBlock();
+		for(int r = 0; r < 10; r++) {
+			in.set(r, 0, r * 0.5);
+		}
-		CompressionSettings cs = new CompressionSettingsBuilder()
-			.addValidCompression(AColGroup.CompressionType.PiecewiseLinear)
-			.create();
+		CompressionSettings cs = new CompressionSettingsBuilder().addValidCompression(
+			AColGroup.CompressionType.PiecewiseLinear).create();
-		CompressedSizeInfo csi = createTestCompressedSizeInfo();
+		CompressedSizeInfo csi = createTestCompressedSizeInfo();
-		List<AColGroup> colGroups = ColGroupFactory.compressColGroups(in, csi, cs);
+		List<AColGroup> colGroups = ColGroupFactory.compressColGroups(in, csi, cs);
-		boolean hasPiecewise = colGroups.stream()
-			.anyMatch(cg -> cg instanceof ColGroupPiecewiseLinearCompressed);
-		assertTrue(hasPiecewise);
-	}
+		boolean hasPiecewise = colGroups.stream().anyMatch(cg -> cg instanceof ColGroupPiecewiseLinearCompressed);
+		assertTrue(hasPiecewise);
+	}
-}
\ No newline at end of file
+}
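Taken together, the pieces of this series compose as follows. A minimal usage sketch, not part of any patch; it assumes the PiecewiseLinear compression type, setPiecewiseTargetLoss, and compressPiecewiseLinearFunctional land as shown in the surrounding commits:

	// illustrative sketch only; all names assume the surrounding patches are applied
	MatrixBlock in = new MatrixBlock(10, 1, false);
	in.allocateDenseBlock();
	for(int r = 0; r < 10; r++)
		in.set(r, 0, r * 0.5); // a perfectly linear column should compress into a single segment
	CompressionSettings cs = new CompressionSettingsBuilder()
		.addValidCompression(AColGroup.CompressionType.PiecewiseLinear).create();
	cs.setPiecewiseTargetLoss(1e-6); // maximum allowed MSE per value in the column
	AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctional(
		ColIndexFactory.create(new int[] {0}), in, cs);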
From 0faa2f830b34c881bb2ee573764dcd6ba1202522 Mon Sep 17 00:00:00 2001
From: mori49 <84979219+mori49@users.noreply.github.com>
Date: Thu, 5 Feb 2026 16:23:38 +0100
Subject: [PATCH 13/21] wip: fix formatting; fix: getIdx() efficiency

---
 .../ColGroupPiecewiseLinearCompressed.java    | 214 ++++++++++--------
 1 file changed, 118 insertions(+), 96 deletions(-)
 rename src/main/java/org/apache/sysds/runtime/compress/colgroup/{scheme => }/ColGroupPiecewiseLinearCompressed.java (63%)

diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressed.java
similarity index 63%
rename from src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java
rename to src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressed.java
index 1f39dc44cb0..35891eb8c53 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/scheme/ColGroupPiecewiseLinearCompressed.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressed.java
@@ -1,9 +1,8 @@
-package org.apache.sysds.runtime.compress.colgroup.scheme;
+package org.apache.sysds.runtime.compress.colgroup;

-import org.apache.sysds.runtime.compress.colgroup.AColGroup;
-import org.apache.sysds.runtime.compress.colgroup.AColGroupCompressed;
-import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils;
+import org.apache.commons.lang3.NotImplementedException;
 import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.compress.colgroup.scheme.ICLAScheme;
 import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
 import org.apache.sysds.runtime.data.DenseBlock;
@@ -34,6 +33,7 @@ protected ColGroupPiecewiseLinearCompressed(IColIndex colIndices) {
 	public ColGroupPiecewiseLinearCompressed(IColIndex colIndexes, int[] breakpoints, double[] slopes,
 		double[] intercepts, int numRows) {
 		super(colIndexes);
+		this.colIndexes = colIndexes;
 		this.breakpoints = breakpoints;
 		this.slopes = slopes;
 		this.intercepts = intercepts;
@@ -60,333 +60,355 @@ public static AColGroup create(IColIndex colIndexes, int[] breakpoints, double[]
 	@Override
 	public void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC) {
-		if(db == null || _colIndexes == null || _colIndexes.size() == 0 || breakpoints == null || slopes == null ||
+		// Safety check
+		if(db == null || colIndexes == null || colIndexes.size() == 0 || breakpoints == null || slopes == null ||
 			intercepts == null) {
 			return;
 		}
-
-		int numSeg = breakpoints.length - 1;
-		if(numSeg <= 0 || rl >= ru) {
+		// Validate segments
+		int sizeSegment = breakpoints.length - 1;
+		if(sizeSegment <= 0 || rl >= ru) {
 			return;
 		}
-
-		final int col = _colIndexes.get(0);
-
-		for(int s = 0; s < numSeg; s++) {
-			int segStart = breakpoints[s];
-			int segEnd = breakpoints[s + 1];
+		// Iterate over all segments
+		final int column = _colIndexes.get(0);
+		for(int currentSeg = 0; currentSeg < sizeSegment; currentSeg++) {
+			int segStart = breakpoints[currentSeg];
+			int segEnd = breakpoints[currentSeg + 1];
 			if(segStart >= segEnd)
-				continue; // Invalid Segment
+				continue;

-			double a = slopes[s];
-			double b = intercepts[s];
+			double currentSlope = slopes[currentSeg];
+			double currentIntercepts = intercepts[currentSeg];

-			int rs = Math.max(segStart, rl);
-			int re = Math.min(segEnd, ru);
-			if(rs >= re)
+			int rowStart = Math.max(segStart, rl);
+			int rowEnd = Math.min(segEnd, ru);
+			if(rowStart >= rowEnd)
 				continue;

-			for(int r = rs; r < re; r++) {
-				double yhat = a * r + b;
-				int gr = offR + r;
-				int gc = offC + col;
+			// Fill the DenseBlock
+			for(int r = rowStart; r < rowEnd; r++) {
+				double yhat = currentSlope * r + currentIntercepts;
+				int dbRow = offR + r;
+				int dbColumn = offC + column;

-				if(gr >= 0 && gr < db.numRows() && gc >= 0 && gc < db.numCols()) {
-					db.set(gr, gc, yhat);
+				if(dbRow >= 0 && dbRow < db.numRows() && dbColumn >= 0 && dbColumn < db.numCols()) {
+					db.set(dbRow, dbColumn, yhat);
 				}
 			}
 		}
 	}

+	public int[] getBreakpoints() {
+		return breakpoints;
+	}
+
+	public double[] getSlopes() {
+		return slopes;
+	}
+
+	public double[] getIntercepts() {
+		return intercepts;
+	}
+
+	@Override
+	public double getIdx(int r, int colIdx) {
+		// Bounds check for the row and column index (safety check)
+		if(r < 0 || r >= numRows || colIdx < 0 || colIdx >= colIndexes.size()) {
+			return 0.0;
+		}
+		// Binary search for the segment that contains row r, i.e. the largest s with breakpoints[s] <= r.
+		// higherBound starts at breakpoints.length - 2 because that is the last valid segment index.
+		int lowerBound = 0;
+		int higherBound = breakpoints.length - 2;
+		while(lowerBound <= higherBound) {
+			int mid = (lowerBound + higherBound) / 2;
+			if(r < breakpoints[mid + 1]) {
+				higherBound = mid - 1;
+			}
+			else
+				lowerBound = mid + 1;
+		}
+		int segment = Math.min(lowerBound, breakpoints.length - 2);
+
+		return slopes[segment] * (double) r + intercepts[segment];
+	}
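As a sanity check on the search above: it has to agree with a plain linear scan over the segments. A hypothetical reference helper, illustrative only and not part of the patch:

	// hypothetical cross-check helper, not in the patch: returns the first segment s
	// with breakpoints[s] <= r < breakpoints[s + 1]
	static int findSegmentLinear(int[] breakpoints, int r) {
		for(int s = 0; s < breakpoints.length - 1; s++)
			if(r < breakpoints[s + 1])
				return s;
		return breakpoints.length - 2; // clamp to the last segment, mirroring the Math.min above
	}

With breakpoints {0, 5, 10}, both lookups place r = 4 in segment 0 and r = 5 in segment 1.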
+
+	@Override
+	public int getNumValues() {
+		return breakpoints.length + slopes.length + intercepts.length;
+	}
+
 	@Override
 	protected double computeMxx(double c, Builtin builtin) {
-		return 0;
+		throw new NotImplementedException();
 	}

 	@Override
 	protected void computeColMxx(double[] c, Builtin builtin) {
-
+		throw new NotImplementedException();
 	}

 	@Override
 	protected void computeSum(double[] c, int nRows) {
+		throw new NotImplementedException();
 	}

 	@Override
 	protected void computeSumSq(double[] c, int nRows) {
+		throw new NotImplementedException();
 	}

 	@Override
 	protected void computeColSumsSq(double[] c, int nRows) {
+		throw new NotImplementedException();
 	}

 	@Override
 	protected void computeRowSums(double[] c, int rl, int ru, double[] preAgg) {
+		throw new NotImplementedException();
 	}

 	@Override
 	protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru, double[] preAgg) {
+		throw new NotImplementedException();
 	}

 	@Override
 	protected void computeProduct(double[] c, int nRows) {
+		throw new NotImplementedException();
 	}

 	@Override
 	protected void computeRowProduct(double[] c, int rl, int ru, double[] preAgg) {
+		throw new NotImplementedException();
 	}

 	@Override
 	protected void computeColProduct(double[] c, int nRows) {
+		throw new NotImplementedException();
 	}

 	@Override
 	protected double[] preAggSumRows() {
-		return new double[0];
+		throw new NotImplementedException();
 	}

 	@Override
 	protected double[] preAggSumSqRows() {
-		return new double[0];
+		throw new NotImplementedException();
 	}

 	@Override
 	protected double[] preAggProductRows() {
-		return new double[0];
+		throw new NotImplementedException();
 	}

 	@Override
 	protected double[] preAggBuiltinRows(Builtin builtin) {
-		return new double[0];
+		throw new NotImplementedException();
 	}

 	@Override
 	public boolean sameIndexStructure(AColGroupCompressed that) {
-		return false;
+		throw new NotImplementedException();
 	}

 	@Override
 	protected void tsmm(double[] result, int numColumns, int nRows) {
+		throw new NotImplementedException();
 	}

 	@Override
 	public AColGroup copyAndSet(IColIndex colIndexes) {
-		return null;
+		throw new NotImplementedException();
 	}

 	@Override
 	public void decompressToDenseBlockTransposed(DenseBlock db, int rl, int ru) {
+		throw new NotImplementedException();
 	}

 	@Override
 	public void decompressToSparseBlockTransposed(SparseBlockMCSR sb, int nColOut) {
+		throw new NotImplementedException();
 	}

-	@Override
-	public double getIdx(int r, int colIdx) {
-		// CRUCIAL: bounds check for colIdx!
-		if(r < 0 || r >= numRows || colIdx < 0 || colIdx >= _colIndexes.size()) {
-			return 0.0;
-		}
-
-		// segment search (safe now)
-		int seg = 0;
-		for(int i = 1; i < breakpoints.length; i++) {
-			if(r < breakpoints[i]) {
-				break;
-			}
-			seg = i - 1; // always seg < numSeg!
-		}
-
-		return slopes[seg] * (double) r + intercepts[seg];
-	}
-
-	@Override
-	public int getNumValues() {
-		return breakpoints.length + slopes.length + intercepts.length;
-	}
-
 	@Override
 	public CompressionType getCompType() {
-		return null;
+		throw new NotImplementedException();
 	}

 	@Override
 	protected ColGroupType getColGroupType() {
-		return null;
+		throw new NotImplementedException();
 	}

 	@Override
 	public void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC) {
+		throw new NotImplementedException();
 	}

 	@Override
 	public AColGroup rightMultByMatrix(MatrixBlock right, IColIndex allCols, int k) {
-		return null;
+		throw new NotImplementedException();
 	}

 	@Override
 	public void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock result, int rl, int ru, int cl, int cu) {
+		throw new NotImplementedException();
 	}

 	@Override
 	public void leftMultByAColGroup(AColGroup lhs, MatrixBlock result, int nRows) {
+		throw new NotImplementedException();
 	}

 	@Override
 	public void tsmmAColGroup(AColGroup other, MatrixBlock result) {
+		throw new NotImplementedException();
 	}

 	@Override
 	public AColGroup scalarOperation(ScalarOperator op) {
-		return null;
+		throw new NotImplementedException();
 	}

 	@Override
 	public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) {
-		return null;
+		throw new NotImplementedException();
 	}

 	@Override
 	public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) {
-		return null;
+		throw new NotImplementedException();
 	}

 	@Override
 	protected AColGroup sliceSingleColumn(int idx) {
-		return null;
+		throw new NotImplementedException();
 	}

 	@Override
 	protected AColGroup sliceMultiColumns(int idStart, int idEnd, IColIndex outputCols) {
-		return null;
+		throw new NotImplementedException();
 	}

 	@Override
 	public AColGroup sliceRows(int rl, int ru) {
-		return null;
+		throw new NotImplementedException();
 	}

 	@Override
 	public boolean containsValue(double pattern) {
-		return false;
+		throw new NotImplementedException();
 	}

 	@Override
 	public long getNumberNonZeros(int nRows) {
-		return 0;
+		throw new NotImplementedException();
 	}

 	@Override
 	public AColGroup replace(double pattern, double replace) {
-		return null;
+		throw new NotImplementedException();
 	}

 	@Override
 	public void computeColSums(double[] c, int nRows) {
+		throw new NotImplementedException();
 	}

 	@Override
 	public CmCovObject centralMoment(CMOperator op, int nRows) {
-		return null;
+		throw new NotImplementedException();
 	}

 	@Override
 	public AColGroup rexpandCols(int max, boolean ignore, boolean cast, int nRows) {
-		return null;
+		throw new NotImplementedException();
 	}

 	@Override
 	public double getCost(ComputationCostEstimator e, int nRows) {
-		return 0;
+		throw new NotImplementedException();
 	}

 	@Override
 	public AColGroup unaryOperation(UnaryOperator op) {
-		return null;
+		throw new NotImplementedException();
 	}

 	@Override
 	public AColGroup append(AColGroup g) {
-		return null;
+		throw new NotImplementedException();
 	}

 	@Override
 	protected AColGroup appendNInternal(AColGroup[] groups, int blen, int rlen) {
-		return null;
+		throw new
NotImplementedException(); } @Override public ICLAScheme getCompressionScheme() { - return null; + throw new NotImplementedException(); } @Override public AColGroup recompress() { - return null; + throw new NotImplementedException(); } @Override public CompressedSizeInfoColGroup getCompressionInfo(int nRow) { - return null; + throw new NotImplementedException(); } @Override protected AColGroup fixColIndexes(IColIndex newColIndex, int[] reordering) { - return null; + throw new NotImplementedException(); } @Override public AColGroup reduceCols() { - return null; + throw new NotImplementedException(); } @Override public double getSparsity() { - return 0; + throw new NotImplementedException(); } @Override protected void sparseSelection(MatrixBlock selection, ColGroupUtils.P[] points, MatrixBlock ret, int rl, int ru) { - + throw new NotImplementedException(); } @Override protected void denseSelection(MatrixBlock selection, ColGroupUtils.P[] points, MatrixBlock ret, int rl, int ru) { - + throw new NotImplementedException(); } @Override public AColGroup[] splitReshape(int multiplier, int nRow, int nColOrg) { - return new AColGroup[0]; - } - - public int[] getBreakpoints() { - return breakpoints; + throw new NotImplementedException(); } - public double[] getSlopes() { - return slopes; - } - - public double[] getIntercepts() { - return intercepts; - } } From 698a942eb27a142a35ebf8bfb780b9e7a6f55143 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Thu, 5 Feb 2026 16:24:41 +0100 Subject: [PATCH 14/21] fix: reverted file --- .../compress/CompressionSettingsBuilder.java | 606 +++++++++--------- 1 file changed, 303 insertions(+), 303 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java index 9af1b5aff2e..02c9f97498d 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java @@ -34,332 +34,332 @@ * Builder pattern for Compression Settings. See CompressionSettings for details on values. 
 */
public class CompressionSettingsBuilder {
-	private double samplingRatio;
-	private double samplePower = 0.65;
-	private boolean allowSharedDictionary = false;
-	private String transposeInput;
-	private int seed = -1;
-	private boolean lossy = false;
-	private EnumSet<CompressionType> validCompressions;
-	private boolean sortValuesByLength = true;
-	private int maxColGroupCoCode = 10000;
-	private double coCodePercentage = 0.01;
-	private int minimumSampleSize = 3000;
-	private int maxSampleSize = 1000000;
-	private EstimationType estimationType = EstimationType.HassAndStokes;
-	private PartitionerType columnPartitioner;
-	private CostType costType;
-	private double minimumCompressionRatio = 1.0;
-	private boolean isInSparkInstruction = false;
-	private SORT_TYPE sdcSortType = SORT_TYPE.MATERIALIZE;
-	private double[] scaleFactors = null;
-	private boolean preferDeltaEncoding = false;
+	private double samplingRatio;
+	private double samplePower = 0.65;
+	private boolean allowSharedDictionary = false;
+	private String transposeInput;
+	private int seed = -1;
+	private boolean lossy = false;
+	private EnumSet<CompressionType> validCompressions;
+	private boolean sortValuesByLength = true;
+	private int maxColGroupCoCode = 10000;
+	private double coCodePercentage = 0.01;
+	private int minimumSampleSize = 3000;
+	private int maxSampleSize = 1000000;
+	private EstimationType estimationType = EstimationType.HassAndStokes;
+	private PartitionerType columnPartitioner;
+	private CostType costType;
+	private double minimumCompressionRatio = 1.0;
+	private boolean isInSparkInstruction = false;
+	private SORT_TYPE sdcSortType = SORT_TYPE.MATERIALIZE;
+	private double[] scaleFactors = null;
+	private boolean preferDeltaEncoding = false;

-	public CompressionSettingsBuilder() {
+	public CompressionSettingsBuilder() {

-		DMLConfig conf = ConfigurationManager.getDMLConfig();
-		this.lossy = conf.getBooleanValue(DMLConfig.COMPRESSED_LOSSY);
-		this.validCompressions = EnumSet.of(CompressionType.UNCOMPRESSED, CompressionType.CONST, CompressionType.EMPTY);
-		String[] validCompressionsString = conf.getTextValue(DMLConfig.COMPRESSED_VALID_COMPRESSIONS).split(",");
-		for(String comp : validCompressionsString)
-			validCompressions.add(CompressionType.valueOf(comp));
-		samplingRatio = conf.getDoubleValue(DMLConfig.COMPRESSED_SAMPLING_RATIO);
-		columnPartitioner = PartitionerType.valueOf(conf.getTextValue(DMLConfig.COMPRESSED_COCODE));
-		costType = CostType.valueOf(conf.getTextValue(DMLConfig.COMPRESSED_COST_MODEL));
-		transposeInput = conf.getTextValue(DMLConfig.COMPRESSED_TRANSPOSE);
-		seed = DMLScript.SEED;
+		DMLConfig conf = ConfigurationManager.getDMLConfig();
+		this.lossy = conf.getBooleanValue(DMLConfig.COMPRESSED_LOSSY);
+		this.validCompressions = EnumSet.of(CompressionType.UNCOMPRESSED, CompressionType.CONST, CompressionType.EMPTY);
+		String[] validCompressionsString = conf.getTextValue(DMLConfig.COMPRESSED_VALID_COMPRESSIONS).split(",");
+		for(String comp : validCompressionsString)
+			validCompressions.add(CompressionType.valueOf(comp));
+		samplingRatio = conf.getDoubleValue(DMLConfig.COMPRESSED_SAMPLING_RATIO);
+		columnPartitioner = PartitionerType.valueOf(conf.getTextValue(DMLConfig.COMPRESSED_COCODE));
+		costType = CostType.valueOf(conf.getTextValue(DMLConfig.COMPRESSED_COST_MODEL));
+		transposeInput = conf.getTextValue(DMLConfig.COMPRESSED_TRANSPOSE);
+		seed = DMLScript.SEED;

-	}
+	}

-	/**
-	 * Sets the scale factors for compression, enabling quantization-fused compression.
- * - * @param scaleFactors An array of scale factors applied during compression. - * - If row-wise scaling is used, this should be an array where each value corresponds to a row. - * - If a single scalar is provided, it is applied uniformly to the entire matrix. - * @return The CompressionSettingsBuilder instance with the updated scale factors. - */ - public CompressionSettingsBuilder setScaleFactor(double[] scaleFactors) { - this.scaleFactors = scaleFactors; - return this; - } + /** + * Sets the scale factors for compression, enabling quantization-fused compression. + * + * @param scaleFactors An array of scale factors applied during compression. + * - If row-wise scaling is used, this should be an array where each value corresponds to a row. + * - If a single scalar is provided, it is applied uniformly to the entire matrix. + * @return The CompressionSettingsBuilder instance with the updated scale factors. + */ + public CompressionSettingsBuilder setScaleFactor(double[] scaleFactors) { + this.scaleFactors = scaleFactors; + return this; + } - /** - * Copy the settings from another CompressionSettings Builder, modifies this, not that. - * - * @param that The other CompressionSettingsBuilder to copy settings from. - * @return The modified CompressionSettings in the same object. - */ - public CompressionSettingsBuilder copySettings(CompressionSettings that) { - this.samplingRatio = that.samplingRatio; - this.allowSharedDictionary = that.allowSharedDictionary; - this.transposeInput = that.transposeInput; - this.seed = that.seed; - this.lossy = that.lossy; - this.validCompressions = EnumSet.copyOf(that.validCompressions); - this.sortValuesByLength = that.sortTuplesByFrequency; - this.columnPartitioner = that.columnPartitioner; - this.maxColGroupCoCode = that.maxColGroupCoCode; - this.coCodePercentage = that.coCodePercentage; - this.minimumSampleSize = that.minimumSampleSize; - this.preferDeltaEncoding = that.preferDeltaEncoding; - return this; - } + /** + * Copy the settings from another CompressionSettings Builder, modifies this, not that. + * + * @param that The other CompressionSettingsBuilder to copy settings from. + * @return The modified CompressionSettings in the same object. + */ + public CompressionSettingsBuilder copySettings(CompressionSettings that) { + this.samplingRatio = that.samplingRatio; + this.allowSharedDictionary = that.allowSharedDictionary; + this.transposeInput = that.transposeInput; + this.seed = that.seed; + this.lossy = that.lossy; + this.validCompressions = EnumSet.copyOf(that.validCompressions); + this.sortValuesByLength = that.sortTuplesByFrequency; + this.columnPartitioner = that.columnPartitioner; + this.maxColGroupCoCode = that.maxColGroupCoCode; + this.coCodePercentage = that.coCodePercentage; + this.minimumSampleSize = that.minimumSampleSize; + this.preferDeltaEncoding = that.preferDeltaEncoding; + return this; + } - /** - * Set the Compression to use Lossy compression. - * - * @param lossy A boolean specifying if the compression should be lossy - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setLossy(boolean lossy) { - this.lossy = lossy; - return this; - } + /** + * Set the Compression to use Lossy compression. + * + * @param lossy A boolean specifying if the compression should be lossy + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setLossy(boolean lossy) { + this.lossy = lossy; + return this; + } - /** - * Set the sampling ratio in percent to sample the input matrix. 
Input value should be in range 0.0 - 1.0 - * - * @param samplingRatio The ratio to sample from the input - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setSamplingRatio(double samplingRatio) { - this.samplingRatio = samplingRatio; - return this; - } + /** + * Set the sampling ratio in percent to sample the input matrix. Input value should be in range 0.0 - 1.0 + * + * @param samplingRatio The ratio to sample from the input + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setSamplingRatio(double samplingRatio) { + this.samplingRatio = samplingRatio; + return this; + } - /** - * Set the sortValuesByLength flag. This sorts the dictionaries containing the data based on their occurences in the - * ColGroup. Improving cache efficiency especially for diverse column groups. - * - * @param sortValuesByLength A boolean specifying if the values should be sorted - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setSortValuesByLength(boolean sortValuesByLength) { - this.sortValuesByLength = sortValuesByLength; - return this; - } + /** + * Set the sortValuesByLength flag. This sorts the dictionaries containing the data based on their occurences in the + * ColGroup. Improving cache efficiency especially for diverse column groups. + * + * @param sortValuesByLength A boolean specifying if the values should be sorted + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setSortValuesByLength(boolean sortValuesByLength) { + this.sortValuesByLength = sortValuesByLength; + return this; + } - /** - * Allow the Dictionaries to be shared between different column groups. - * - * @param allowSharedDictionary A boolean specifying if the dictionary can be shared between column groups. - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setAllowSharedDictionary(boolean allowSharedDictionary) { - this.allowSharedDictionary = allowSharedDictionary; - return this; - } + /** + * Allow the Dictionaries to be shared between different column groups. + * + * @param allowSharedDictionary A boolean specifying if the dictionary can be shared between column groups. + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setAllowSharedDictionary(boolean allowSharedDictionary) { + this.allowSharedDictionary = allowSharedDictionary; + return this; + } - /** - * Specify if the input matrix should be transposed before compression. This improves cache efficiency while - * compression the input matrix - * - * @param transposeInput string specifying if the input should be transposed before compression, should be one of - * "auto", "true" or "false" - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setTransposeInput(String transposeInput) { - switch(transposeInput) { - case "auto": - case "true": - case "false": - this.transposeInput = transposeInput; - break; - default: - throw new DMLCompressionException("Invalid transpose technique"); - } - return this; - } + /** + * Specify if the input matrix should be transposed before compression. 
This improves cache efficiency while + * compression the input matrix + * + * @param transposeInput string specifying if the input should be transposed before compression, should be one of + * "auto", "true" or "false" + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setTransposeInput(String transposeInput) { + switch(transposeInput) { + case "auto": + case "true": + case "false": + this.transposeInput = transposeInput; + break; + default: + throw new DMLCompressionException("Invalid transpose technique"); + } + return this; + } - /** - * Set the seed for the compression operation. - * - * @param seed The seed used in sampling the matrix and general operations in the compression. - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setSeed(int seed) { - this.seed = seed; - return this; - } + /** + * Set the seed for the compression operation. + * + * @param seed The seed used in sampling the matrix and general operations in the compression. + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setSeed(int seed) { + this.seed = seed; + return this; + } - /** - * Set the valid compression strategies used for the compression. - * - * @param validCompressions An EnumSet of CompressionTypes to use in the compression - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setValidCompressions(EnumSet validCompressions) { - // should always contain Uncompressed as an option. - if(!validCompressions.contains(CompressionType.UNCOMPRESSED)) - validCompressions.add(CompressionType.UNCOMPRESSED); - if(!validCompressions.contains(CompressionType.CONST)) - validCompressions.add(CompressionType.CONST); - if(!validCompressions.contains(CompressionType.EMPTY)) - validCompressions.add(CompressionType.EMPTY); - this.validCompressions = validCompressions; - return this; - } + /** + * Set the valid compression strategies used for the compression. + * + * @param validCompressions An EnumSet of CompressionTypes to use in the compression + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setValidCompressions(EnumSet validCompressions) { + // should always contain Uncompressed as an option. + if(!validCompressions.contains(CompressionType.UNCOMPRESSED)) + validCompressions.add(CompressionType.UNCOMPRESSED); + if(!validCompressions.contains(CompressionType.CONST)) + validCompressions.add(CompressionType.CONST); + if(!validCompressions.contains(CompressionType.EMPTY)) + validCompressions.add(CompressionType.EMPTY); + this.validCompressions = validCompressions; + return this; + } - /** - * Add a single valid compression type to the EnumSet of valid compressions. - * - * @param cp The compression type to add to the valid ones. - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder addValidCompression(CompressionType cp) { - this.validCompressions.add(cp); - return this; - } + /** + * Add a single valid compression type to the EnumSet of valid compressions. + * + * @param cp The compression type to add to the valid ones. + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder addValidCompression(CompressionType cp) { + this.validCompressions.add(cp); + return this; + } - /** - * Clear all the compression types allowed in the compression. This will only allow the Uncompressed ColGroup type. 
- * Since this is required for operation of the compression - * - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder clearValidCompression() { - this.validCompressions = EnumSet.of(CompressionType.UNCOMPRESSED, CompressionType.EMPTY, CompressionType.CONST); - return this; - } + /** + * Clear all the compression types allowed in the compression. This will only allow the Uncompressed ColGroup type. + * Since this is required for operation of the compression + * + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder clearValidCompression() { + this.validCompressions = EnumSet.of(CompressionType.UNCOMPRESSED, CompressionType.EMPTY, CompressionType.CONST); + return this; + } - /** - * Set the type of CoCoding Partitioner type to use for combining columns together. - * - * @param columnPartitioner The Strategy to select from PartitionerType - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setColumnPartitioner(PartitionerType columnPartitioner) { - this.columnPartitioner = columnPartitioner; - return this; - } + /** + * Set the type of CoCoding Partitioner type to use for combining columns together. + * + * @param columnPartitioner The Strategy to select from PartitionerType + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setColumnPartitioner(PartitionerType columnPartitioner) { + this.columnPartitioner = columnPartitioner; + return this; + } - /** - * Set the maximum number of columns to CoCode together in the CoCoding strategy. Compression time increase with - * higher numbers. - * - * @param maxColGroupCoCode The max selected. - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setMaxColGroupCoCode(int maxColGroupCoCode) { - this.maxColGroupCoCode = maxColGroupCoCode; - return this; - } + /** + * Set the maximum number of columns to CoCode together in the CoCoding strategy. Compression time increase with + * higher numbers. + * + * @param maxColGroupCoCode The max selected. + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setMaxColGroupCoCode(int maxColGroupCoCode) { + this.maxColGroupCoCode = maxColGroupCoCode; + return this; + } - /** - * Set the coCode percentage, the effect is different based on the coCoding strategy, but the general effect is that - * higher values results in more coCoding while lower values result in less. - * - * Note that with high coCoding the compression ratio would possibly be lower. - * - * @param coCodePercentage The percentage to set. - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setCoCodePercentage(double coCodePercentage) { - this.coCodePercentage = coCodePercentage; - return this; - } + /** + * Set the coCode percentage, the effect is different based on the coCoding strategy, but the general effect is that + * higher values results in more coCoding while lower values result in less. + * + * Note that with high coCoding the compression ratio would possibly be lower. + * + * @param coCodePercentage The percentage to set. + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setCoCodePercentage(double coCodePercentage) { + this.coCodePercentage = coCodePercentage; + return this; + } - /** - * Set the minimum sample size to extract from a given matrix, this overrules the sample percentage if the sample - * percentage extracted is lower than this minimum bound. 
- * - * @param minimumSampleSize The minimum sample size to extract - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setMinimumSampleSize(int minimumSampleSize) { - this.minimumSampleSize = minimumSampleSize; - return this; - } + /** + * Set the minimum sample size to extract from a given matrix, this overrules the sample percentage if the sample + * percentage extracted is lower than this minimum bound. + * + * @param minimumSampleSize The minimum sample size to extract + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setMinimumSampleSize(int minimumSampleSize) { + this.minimumSampleSize = minimumSampleSize; + return this; + } - /** - * Set the maximum sample size to extract from a given matrix, this overrules the sample percentage if the sample - * percentage extracted is higher than this maximum bound. - * - * @param maxSampleSize The maximum sample size to extract - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setMaxSampleSize(int maxSampleSize) { - this.maxSampleSize = maxSampleSize; - return this; - } + /** + * Set the maximum sample size to extract from a given matrix, this overrules the sample percentage if the sample + * percentage extracted is higher than this maximum bound. + * + * @param maxSampleSize The maximum sample size to extract + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setMaxSampleSize(int maxSampleSize) { + this.maxSampleSize = maxSampleSize; + return this; + } - /** - * Set the estimation type used for the sampled estimates. - * - * @param estimationType the estimation type in used. - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setEstimationType(EstimationType estimationType) { - this.estimationType = estimationType; - return this; - } + /** + * Set the estimation type used for the sampled estimates. + * + * @param estimationType the estimation type in used. + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setEstimationType(EstimationType estimationType) { + this.estimationType = estimationType; + return this; + } - /** - * Set the cost type used for estimating the cost of column groups default is memory based. - * - * @param costType The Cost type wanted - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setCostType(CostType costType) { - this.costType = costType; - return this; - } + /** + * Set the cost type used for estimating the cost of column groups default is memory based. + * + * @param costType The Cost type wanted + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setCostType(CostType costType) { + this.costType = costType; + return this; + } - /** - * Set the minimum compression ratio to be achieved by the compression. - * - * @param ratio The ratio to achieve while compressing - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setMinimumCompressionRatio(double ratio) { - this.minimumCompressionRatio = ratio; - return this; - } + /** + * Set the minimum compression ratio to be achieved by the compression. + * + * @param ratio The ratio to achieve while compressing + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setMinimumCompressionRatio(double ratio) { + this.minimumCompressionRatio = ratio; + return this; + } - /** - * Inform the compression that it is executed in a spark instruction. 
- * - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setIsInSparkInstruction() { - this.isInSparkInstruction = true; - return this; - } + /** + * Inform the compression that it is executed in a spark instruction. + * + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setIsInSparkInstruction() { + this.isInSparkInstruction = true; + return this; + } - /** - * Set the sort type to use. - * - * @param sdcSortType The sort type for the construction of SDC groups - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setSDCSortType(SORT_TYPE sdcSortType) { - this.sdcSortType = sdcSortType; - return this; - } + /** + * Set the sort type to use. + * + * @param sdcSortType The sort type for the construction of SDC groups + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setSDCSortType(SORT_TYPE sdcSortType) { + this.sdcSortType = sdcSortType; + return this; + } - /** - * Set whether to prefer delta encoding during compression estimation. - * When enabled, the compression estimator will use delta encoding statistics - * instead of regular encoding statistics. - * - * @param preferDeltaEncoding Whether to prefer delta encoding - * @return The CompressionSettingsBuilder - */ - public CompressionSettingsBuilder setPreferDeltaEncoding(boolean preferDeltaEncoding) { - this.preferDeltaEncoding = preferDeltaEncoding; - return this; - } + /** + * Set whether to prefer delta encoding during compression estimation. + * When enabled, the compression estimator will use delta encoding statistics + * instead of regular encoding statistics. + * + * @param preferDeltaEncoding Whether to prefer delta encoding + * @return The CompressionSettingsBuilder + */ + public CompressionSettingsBuilder setPreferDeltaEncoding(boolean preferDeltaEncoding) { + this.preferDeltaEncoding = preferDeltaEncoding; + return this; + } - /** - * Create the CompressionSettings object to use in the compression. - * - * @return The CompressionSettings - */ - public CompressionSettings create() { - return new CompressionSettings(samplingRatio, samplePower, allowSharedDictionary, transposeInput, seed, lossy, - validCompressions, sortValuesByLength, columnPartitioner, maxColGroupCoCode, coCodePercentage, - minimumSampleSize, maxSampleSize, estimationType, costType, minimumCompressionRatio, isInSparkInstruction, - sdcSortType, scaleFactors, preferDeltaEncoding); - } -} \ No newline at end of file + /** + * Create the CompressionSettings object to use in the compression. 
+	 *
+	 * @return The CompressionSettings
+	 */
+	public CompressionSettings create() {
+		return new CompressionSettings(samplingRatio, samplePower, allowSharedDictionary, transposeInput, seed, lossy,
+			validCompressions, sortValuesByLength, columnPartitioner, maxColGroupCoCode, coCodePercentage,
+			minimumSampleSize, maxSampleSize, estimationType, costType, minimumCompressionRatio, isInSparkInstruction,
+			sdcSortType, scaleFactors, preferDeltaEncoding);
+	}
+}

From 898af6892e3276303b9f37cc45c40d289ccbbc37 Mon Sep 17 00:00:00 2001
From: mori49 <84979219+mori49@users.noreply.github.com>
Date: Thu, 5 Feb 2026 16:26:12 +0100
Subject: [PATCH 15/21] rm: comment reformatted and add targetloss handling

---
 .../apache/sysds/runtime/compress/CompressionSettings.java | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
index 7d5a1dac51a..99c4b9c2ecb 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
@@ -136,10 +136,8 @@ public class CompressionSettings {

 	public final boolean preferDeltaEncoding;

-	/**
-	 * Target total loss for piecewise linear compression. Interpretation: the maximum allowed global MSE per value in
-	 * the column. 0.0 ~ virtually lossless, many segments; >0 ~ more approximation allowed, fewer segments
-	 */
+	// Handling of the target loss for piecewise linear compression
+
 	private double piecewiseTargetLoss = Double.NaN;

 	public void setPiecewiseTargetLoss(double piecewiseTargetLoss) {

From d8ebc9fd50deb7910036e5eaf234f64c9d6f3f78 Mon Sep 17 00:00:00 2001
From: mori49 <84979219+mori49@users.noreply.github.com>
Date: Thu, 5 Feb 2026 16:27:16 +0100
Subject: [PATCH 16/21] fix: reverted file and add enum CompressionType
 PiecewiseLinear

---
 .../runtime/compress/colgroup/AColGroup.java  | 232 +++++++++---------
 1 file changed, 114 insertions(+), 118 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
index d761af7667a..e2bf69f5c15 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
@@ -55,7 +55,7 @@

 /**
  * Abstract Class that is the lowest class type for the Compression framework.
- * 
+ *
  * AColGroup stores information about a number of columns.
  *
 */
@@ -64,7 +64,6 @@ public abstract class AColGroup implements Serializable {

 	private static final long serialVersionUID = -1318908671481L;

 	/** Public super types of compression ColGroups supported */
-	// Enum added -> but do I also have to add this to the ColGroupType enum?
 	public static enum CompressionType {
 		UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional, PiecewiseLinear;

@@ -83,7 +82,7 @@ public boolean isSDC() {

 	/**
 	 * Concrete ColGroupType
-	 * 
+	 *
 	 * Protected such that outside the ColGroup package it should be unknown which specific subtype is used.
 	 */
 	protected static enum ColGroupType {
@@ -96,7 +95,7 @@ protected static enum ColGroupType {

 	/**
 	 * Main constructor.
- * + * * @param colIndices offsets of the columns in the matrix block that make up the group */ protected AColGroup(IColIndex colIndices) { @@ -105,7 +104,7 @@ protected AColGroup(IColIndex colIndices) { /** * Obtain the offsets of the columns in the matrix block that make up the group - * + * * @return offsets of the columns in the matrix block that make up the group */ public final IColIndex getColIndices() { @@ -114,7 +113,7 @@ public final IColIndex getColIndices() { /** * Obtain the number of columns in this column group. - * + * * @return number of columns in this column group */ public final int getNumCols() { @@ -125,9 +124,9 @@ public final int getNumCols() { * Shift all column indexes contained by an offset. * * This is used for rbind to combine compressed matrices. - * + * * Since column indexes are reused between operations, we allocate a new list here to be safe - * + * * @param offset The offset to move all columns * @return A new column group object with the shifted columns */ @@ -139,7 +138,7 @@ public final AColGroup shiftColIndices(int offset) { * Copy the content of the column group with pointers to the previous content but with new column given Note this * method does not verify if the colIndexes specified are valid and correct dimensions for the underlying column * groups. - * + * * @param colIndexes the new indexes to use in the copy * @return a new object with pointers to underlying data. */ @@ -147,7 +146,7 @@ public final AColGroup shiftColIndices(int offset) { /** * Get the upper bound estimate of in memory allocation for the column group. - * + * * @return an upper bound on the number of bytes used to store this ColGroup in memory. */ public long estimateInMemorySize() { @@ -158,9 +157,9 @@ public long estimateInMemorySize() { /** * Decompress a range of rows into a sparse block - * + * * Note that this is using append, so the sparse column indexes need to be sorted afterwards. - * + * * @param sb Sparse Target block * @param rl Row to start at * @param ru Row to end at @@ -171,7 +170,7 @@ public final void decompressToSparseBlock(SparseBlock sb, int rl, int ru) { /** * Decompress a range of rows into a dense block - * + * * @param db Dense target block * @param rl Row to start at * @param ru Row to end at @@ -182,7 +181,7 @@ public final void decompressToDenseBlock(DenseBlock db, int rl, int ru) { /** * Decompress a range of rows into a dense transposed block. - * + * * @param db Dense target block * @param rl Row in this column group to start at. * @param ru Row in this column group to end at. @@ -192,7 +191,7 @@ public final void decompressToDenseBlock(DenseBlock db, int rl, int ru) { /** * Decompress the column group to the sparse transposed block. Note that the column groups would only need to * decompress into specific sub rows of the Sparse block - * + * * @param sb Sparse target block * @param nColOut The number of columns in the sb. */ @@ -200,7 +199,7 @@ public final void decompressToDenseBlock(DenseBlock db, int rl, int ru) { /** * Serializes column group to data output. - * + * * @param out data output * @throws IOException if IOException occurs */ @@ -213,7 +212,7 @@ protected void write(DataOutput out) throws IOException { /** * Returns the exact serialized size of column group. This can be used for example for buffer preallocation. 
-	 * 
+	 *
 	 * @return exact serialized size for column group
 	 */
 	public long getExactSizeOnDisk() {
@@ -226,11 +225,11 @@
 	/**
 	 * Slice out the columns within the range of cl and cu to remove the dictionary values related to these columns. If
 	 * the ColGroup slicing from does not contain any columns within the range null is returned.
-	 * 
+	 *
 	 * @param cl The lower bound of the columns to select
 	 * @param cu The upper bound of the columns to select (not inclusive).
 	 * @return A cloned Column Group, with a copied pointer to the old column groups index structure, but reduced
-	 *         dictionary and _columnIndexes correctly aligned with the expected sliced compressed matrix.
+	 *         dictionary and _columnIndexes correctly aligned with the expected sliced compressed matrix.
 	 */
 	public final AColGroup sliceColumns(int cl, int cu) {
 		if(cl <= _colIndexes.get(0) && cu > _colIndexes.get(_colIndexes.size() - 1)) {
@@ -248,10 +247,10 @@ else if(cu - cl == 1)

 	/**
 	 * Slice out a single column from the column group.
-	 * 
+	 *
 	 * @param col The column to slice, the column could potentially not be inside the column group
 	 * @return A new column group that is a single column, if the column requested is not in this column group null is
-	 *         returned.
+	 *         returned.
 	 */
 	public final AColGroup sliceColumn(int col) {
 		int idx = _colIndexes.findIndex(col);
@@ -263,11 +262,11 @@ public final AColGroup sliceColumn(int col) {

 	/**
 	 * Slice out multiple columns within the interval between the given indexes.
-	 * 
+	 *
 	 * @param cl The lower column index to slice from
 	 * @param cu The upper column index to slice to, (not included)
 	 * @return A column group of this containing the columns specified, returns null if the columns specified is not
-	 *         contained in the column group
+	 *         contained in the column group
 	 */
 	protected final AColGroup sliceMultiColumns(int cl, int cu) {
 		SliceResult sr = _colIndexes.slice(cl, cu);
@@ -279,7 +278,7 @@ protected final AColGroup sliceMultiColumns(int cl, int cu) {

 	/**
 	 * Compute the column sum of the given list of groups
-	 * 
+	 *
 	 * @param groups The Groups to sum
 	 * @param res   The result to put the values into
 	 * @param nRows The number of rows in the groups
@@ -293,9 +292,9 @@ public static double[] colSum(Collection<AColGroup> groups, double[] res, int nR

 	/**
 	 * Get the value at a global row/column position.
-	 * 
+	 *
 	 * In general this performs slowly since a binary search of colIndexes is performed for each lookup.
-	 * 
+	 *
 	 * @param r row
 	 * @param c column
 	 * @return value at the row/column position
@@ -310,7 +309,7 @@ public double get(int r, int c) {

 	/**
 	 * Get the value at a colGroup specific row/column index position.
-	 * 
+	 *
 	 * @param r      row
 	 * @param colIdx column index in the _colIndexes.
 	 * @return value at the row/column index position
@@ -319,16 +318,16 @@ public double get(int r, int c) {

 	/**
 	 * Obtain number of distinct tuples in contained sets of values associated with this column group.
-	 * 
+	 *
 	 * If the column group is uncompressed the number of rows is returned.
-	 * 
+	 *
 	 * @return the number of distinct sets of values associated with the bitmaps in this column group
 	 */
 	public abstract int getNumValues();

 	/**
 	 * Obtain the compression type.
-	 * 
+	 *
 	 * @return How the elements of the column group are compressed.
 	 */
 	public abstract CompressionType getCompType();
@@ -336,14 +335,14 @@ public double get(int r, int c) {

 	/**
 	 * Internally get the specific type of ColGroup, this could be extracted from the object but that does not allow for
 	 * nice switches in the code.
*/ protected abstract ColGroupType getColGroupType(); /** * Decompress into the DenseBlock. (no NNZ handling) - * + * * @param db Target DenseBlock * @param rl Row to start decompression from * @param ru Row to end decompression at (not inclusive) @@ -354,10 +353,10 @@ public double get(int r, int c) { /** * Decompress into the SparseBlock. (no NNZ handling) - * + * * Note this method is allowing to calls to append since it is assumed that the sparse column indexes are sorted * afterwards - * + * * @param sb Target SparseBlock * @param rl Row to start decompression from * @param ru Row to end decompression at (not inclusive) @@ -368,9 +367,9 @@ public double get(int r, int c) { /** * Right matrix multiplication with this column group. - * + * * This method can return null, meaning that the output overlapping group would have been empty. - * + * * @param right The MatrixBlock on the right of this matrix multiplication * @return The new Column Group or null that is the result of the matrix multiplication. */ @@ -380,9 +379,9 @@ public final AColGroup rightMultByMatrix(MatrixBlock right) { /** * Right matrix multiplication with this column group. - * + * * This method can return null, meaning that the output overlapping group would have been empty. - * + * * @param right The MatrixBlock on the right of this matrix multiplication * @param allCols A pre-materialized list of all col indexes, that can be shared across all column groups if use * full, can be set to null. @@ -393,7 +392,7 @@ public final AColGroup rightMultByMatrix(MatrixBlock right) { /** * Right side Matrix multiplication, iterating though this column group and adding to the ret - * + * * @param right Right side matrix to multiply with. * @param ret The return matrix to add results to * @param rl The row of this column group to multiply from @@ -402,20 +401,18 @@ public final AColGroup rightMultByMatrix(MatrixBlock right) { * @param cru The right hand side column upper * @param nRows The number of rows in this column group */ - public void rightDecompressingMult(MatrixBlock right, MatrixBlock ret, int rl, int ru, int nRows, int crl, - int cru) { - throw new NotImplementedException( - "not supporting right Decompressing Multiply on class: " + this.getClass().getSimpleName()); + public void rightDecompressingMult(MatrixBlock right, MatrixBlock ret, int rl, int ru, int nRows, int crl, int cru){ + throw new NotImplementedException("not supporting right Decompressing Multiply on class: " + this.getClass().getSimpleName()); } /** * Do a transposed self matrix multiplication on the left side t(x) %*% x. but only with this column group. - * + * * This gives better performance since there is no need to iterate through all the rows of the matrix, but the * execution can be limited to its number of distinct values. - * + * * Note it only calculate the upper triangle - * + * * @param ret The return matrix block [numColumns x numColumns] * @param nRows The number of rows in the column group */ @@ -423,7 +420,7 @@ public void rightDecompressingMult(MatrixBlock right, MatrixBlock ret, int rl, i /** * Left multiply with this column group. - * + * * @param matrix The matrix to multiply with on the left * @param result The result to output the values into, always dense for the purpose of the column groups * parallelizing @@ -437,7 +434,7 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Left side matrix multiplication with a column group that is transposed. 
- * + * * @param lhs The left hand side Column group to multiply with, the left hand side should be considered * transposed. Also it should be guaranteed that this column group is not empty. * @param result The result matrix to insert the result of the multiplication into @@ -447,16 +444,16 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Matrix multiply with this other column group, but: - * + * * 1. Only output upper triangle values. - * + * * 2. Multiply both ways with "this" being on the left and on the right. - * + * * It should be guaranteed that the input is not the same as the caller of the method. - * + * * The second step is achievable by treating the initial multiplied matrix, and adding its values to the correct * locations in the output. - * + * * @param other The other Column group to multiply with * @param result The result matrix to put the results into */ @@ -465,7 +462,7 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Perform the specified scalar operation directly on the compressed column group, without decompressing individual * cells if possible. - * + * * @param op operation to perform * @return version of this column group with the operation applied */ @@ -473,7 +470,7 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Perform a binary row operation. - * + * * @param op The operation to execute * @param v The vector of values to apply the values contained should be at least the length of the highest * value in the column index @@ -484,7 +481,7 @@ public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock re /** * Short hand add operator call on column group to add a row vector to the column group - * + * * @param v The vector to add * @return A new column group where the vector is added. */ @@ -494,7 +491,7 @@ public AColGroup addVector(double[] v) { /** * Perform a binary row operation. - * + * * @param op The operation to execute * @param v The vector of values to apply the values contained should be at least the length of the highest * value in the column index @@ -506,9 +503,9 @@ public AColGroup addVector(double[] v) { /** * Unary Aggregate operator, since aggregate operators require new object output, the output becomes an uncompressed * matrix. - * + * * The range of rl to ru only applies to row aggregates. (ReduceCol) - * + * * @param op The operator used * @param c The output matrix block * @param nRows The total number of rows in the Column Group @@ -519,9 +516,9 @@ public AColGroup addVector(double[] v) { /** * Slice out column at specific index of this column group. - * + * * It is guaranteed that the column to slice is contained in this columnGroup. - * + * * @param idx The column index to slice out. * @return A new column group containing the columns inside. (never null) */ @@ -529,9 +526,9 @@ public AColGroup addVector(double[] v) { /** * Slice range of columns inside this column group. - * + * * It is guaranteed that the columns to slice is contained in this columnGroup. - * + * * @param idStart The column index to start at * @param idEnd The column index to end at (not included) * @param outputCols The output columns to extract materialized for ease of implementation @@ -541,10 +538,9 @@ public AColGroup addVector(double[] v) { /** * Slice range of rows out of the column group and return a new column group only containing the row segment. 
- * - * Note that this slice should maintain pointers back to the original dictionaries and only modify index - * structures. - * + * + * Note that this slice should maintain pointers back to the original dictionaries and only modify index structures. + * * @param rl The row to start at * @param ru The row to end at (not included) * @return A new column group containing the specified row range. @@ -553,21 +549,21 @@ public AColGroup addVector(double[] v) { /** * Short hand method for getting minimum value contained in this column group. - * + * * @return The minimum value contained in this ColumnGroup */ public abstract double getMin(); /** * Short hand method for getting maximum value contained in this column group. - * + * * @return The maximum value contained in this ColumnGroup */ public abstract double getMax(); /** * Short hand method for getting the sum of this column group - * + * * @param nRows The number of rows in the column group * @return The sum of this column group */ @@ -575,7 +571,7 @@ public AColGroup addVector(double[] v) { /** * Detect if the column group contains a specific value. - * + * * @param pattern The value to look for. * @return boolean saying true if the value is contained. */ @@ -583,7 +579,7 @@ public AColGroup addVector(double[] v) { /** * Get the number of nonZeros contained in this column group. - * + * * @param nRows The number of rows in the column group, this is used for groups that does not contain information * about how many rows they have. * @return The nnz. @@ -592,7 +588,7 @@ public AColGroup addVector(double[] v) { /** * Make a copy of the column group values, and replace all values that match pattern with replacement value. - * + * * @param pattern The value to look for * @param replace The value to replace the other value with * @return A new Column Group, reusing the index structure but with new values. @@ -601,7 +597,7 @@ public AColGroup addVector(double[] v) { /** * Compute the column sum - * + * * @param c The array to add the column sum to. * @param nRows The number of rows in the column group. */ @@ -609,7 +605,7 @@ public AColGroup addVector(double[] v) { /** * Central Moment instruction executed on a column group. - * + * * @param op The Operator to use. * @param nRows The number of rows contained in the ColumnGroup. * @return A Central Moment object. @@ -618,7 +614,7 @@ public AColGroup addVector(double[] v) { /** * Expand the column group to multiple columns. (one hot encode the column group) - * + * * @param max The number of columns to expand to and cutoff values at. * @param ignore If zero and negative values should be ignored. * @param cast If the double values contained should be cast to whole numbers. @@ -629,7 +625,7 @@ public AColGroup addVector(double[] v) { /** * Get the computation cost associated with this column group. - * + * * @param e The computation cost estimator * @param nRows the number of rows in the column group * @return The cost of this column group @@ -638,7 +634,7 @@ public AColGroup addVector(double[] v) { /** * Perform unary operation on the column group and return a new column group - * + * * @param op The operation to perform * @return The new column group */ @@ -646,19 +642,19 @@ public AColGroup addVector(double[] v) { /** * Get if the group is only containing zero - * + * * @return true if empty */ public abstract boolean isEmpty(); /** - * Append the other column group to this column group. This method tries to combine them to return a new column - * group containing both. 
In some cases it is possible in reasonable time, in others it is not. - * + * Append the other column group to this column group. This method tries to combine them to return a new column group + * containing both. In some cases it is possible in reasonable time, in others it is not. + * * The result is first this column group followed by the other column group in higher row values. - * + * * If it is not possible or very inefficient null is returned. - * + * * @param g The other column group * @return A combined column group or null */ @@ -666,9 +662,9 @@ public AColGroup addVector(double[] v) { /** * Append all column groups in the list provided together in one go allocating the output once. - * + * * If it is not possible or very inefficient null is returned. - * + * * @param groups The groups to combine. * @param blen The normal number of rows in the groups * @param rlen The total number of rows of all combined. @@ -680,11 +676,11 @@ public static AColGroup appendN(AColGroup[] groups, int blen, int rlen) { /** * Append all column groups in the list provided together with this. - * + * * A Important detail is the first entry in the group == this, and should not be appended twice. - * + * * If it is not possible or very inefficient null is returned. - * + * * @param groups The groups to combine. * @param blen The normal number of rows in the groups * @param rlen The total number of rows of all combined. @@ -694,7 +690,7 @@ public static AColGroup appendN(AColGroup[] groups, int blen, int rlen) { /** * Get the compression scheme for this column group to enable compression of other data. - * + * * @return The compression scheme of this column group */ public abstract ICLAScheme getCompressionScheme(); @@ -708,14 +704,14 @@ public void clear() { /** * Recompress this column group into a new column group. - * + * * @return A new or the same column group depending on optimization goal. */ public abstract AColGroup recompress(); /** * Recompress this column group into a new column group of the given type. - * + * * @param ct The compressionType that the column group should morph into * @param nRow The number of rows in this columngroup. * @return A new column group @@ -745,7 +741,7 @@ else if(ct == CompressionType.UNCOMPRESSED) { /** * Get the compression info for this column group. - * + * * @param nRow The number of rows in this column group. * @return The compression info for this group. */ @@ -753,7 +749,7 @@ else if(ct == CompressionType.UNCOMPRESSED) { /** * Combine this column group with another - * + * * @param other The other column group to combine with. * @param nRow The number of rows in both column groups. * @return A combined representation as a column group. @@ -764,7 +760,7 @@ public AColGroup combine(AColGroup other, int nRow) { /** * Get encoding of this column group. - * + * * @return The encoding of the index structure. */ public IEncode getEncoding() { @@ -785,19 +781,19 @@ public AColGroup sortColumnIndexes() { /** * Perform row sum on the internal dictionaries, and return the same index structure. - * + * * This method returns null on empty column groups. - * + * * Note this method does not guarantee correct behavior if the given group is AMorphingGroup, instead it should be * morphed to a valid columngroup via extractCommon first. - * + * * @return The reduced colgroup. */ public abstract AColGroup reduceCols(); /** * Selection (left matrix multiply) - * + * * @param selection A sparse matrix with "max" a single one in each row all other values are zero. 
* @param points The coordinates in the selection matrix to extract. * @param ret The MatrixBlock to decompress the selected rows into @@ -810,17 +806,17 @@ public final void selectionMultiply(MatrixBlock selection, P[] points, MatrixBlo else denseSelection(selection, points, ret, rl, ru); } - + /** * Get an approximate sparsity of this column group - * + * * @return the approximate sparsity of this columngroup */ public abstract double getSparsity(); /** * Sparse selection (left matrix multiply) - * + * * @param selection A sparse matrix with "max" a single one in each row all other values are zero. * @param points The coordinates in the selection matrix to extract. * @param ret The Sparse MatrixBlock to decompress the selected rows into @@ -831,7 +827,7 @@ public final void selectionMultiply(MatrixBlock selection, P[] points, MatrixBlo /** * Dense selection (left matrix multiply) - * + * * @param selection A sparse matrix with "max" a single one in each row all other values are zero. * @param points The coordinates in the selection matrix to extract. * @param ret The Dense MatrixBlock to decompress the selected rows into @@ -843,7 +839,7 @@ public final void selectionMultiply(MatrixBlock selection, P[] points, MatrixBlo /** * Method to determine if the columnGroup have the same index structure as another. Note that the column indexes and * dictionaries are allowed to be different. - * + * * @param that the other column group * @return if the index is the same. */ @@ -854,7 +850,7 @@ public boolean sameIndexStructure(AColGroup that) { /** * C bind the list of column groups with this column group. the list of elements provided in the index of each list * is guaranteed to have the same index structures - * + * * @param nRow The number of rows contained in all right and this column group. * @param nCol The number of columns to shift the right hand side column groups over when combining, this should * only effect the column indexes @@ -892,7 +888,7 @@ public AColGroup combineWithSameIndex(int nRow, int nCol, List right) /** * C bind the given column group to this. - * + * * @param nRow The number of rows contained in the right and this column group. * @param nCol The number of columns in this. * @param right The column group to c-bind. @@ -932,16 +928,16 @@ protected IColIndex combineColIndexes(final int nCol, List right) { /** * This method returns a list of column groups that are naive splits of this column group as if it is reshaped. - * + * * This means the column groups rows are split into x number of other column groups where x is the multiplier. - * + * * The indexes are assigned round robbin to each of the output groups, meaning the first index is assigned. - * + * * If for instance the 4. column group is split by a 2 multiplier and there was 5 columns in total originally. The * output becomes 2 column groups at column index 4 and one at 9. - * + * * If possible the split column groups should reuse pointers back to the original dictionaries! - * + * * @param multiplier The number of column groups to split into * @param nRow The number of rows in this column group in case the underlying column group does not know * @param nColOrg The number of overall columns in the host CompressedMatrixBlock. @@ -951,25 +947,25 @@ protected IColIndex combineColIndexes(final int nCol, List right) { /** * This method returns a list of column groups that are naive splits of this column group as if it is reshaped. 
- * + * * This means the column groups rows are split into x number of other column groups where x is the multiplier. - * + * * The indexes are assigned round robbin to each of the output groups, meaning the first index is assigned. - * + * * If for instance the 4. column group is split by a 2 multiplier and there was 5 columns in total originally. The * output becomes 2 column groups at column index 4 and one at 9. - * + * * If possible the split column groups should reuse pointers back to the original dictionaries! - * + * * This specific variation is pushing down the parallelization given via the executor service provided. If not * overwritten the default is to call the normal split reshape - * + * * @param multiplier The number of column groups to split into * @param nRow The number of rows in this column group in case the underlying column group does not know * @param nColOrg The number of overall columns in the host CompressedMatrixBlock * @param pool The executor service to submit parallel tasks to - * @return a list of split column groups * @throws Exception In case there is an error we throw the exception out instead of handling it + * @return a list of split column groups */ public AColGroup[] splitReshapePushDown(final int multiplier, final int nRow, final int nColOrg, final ExecutorService pool) throws Exception { From 36d318673889247d7abad169b15c5911d75be9c6 Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Thu, 5 Feb 2026 16:27:32 +0100 Subject: [PATCH 17/21] fix: reverted file --- .../component/compress/colgroup/ColGroupFactoryTest.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupFactoryTest.java b/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupFactoryTest.java index a1a5c8a6794..c4da48a0232 100644 --- a/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupFactoryTest.java +++ b/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupFactoryTest.java @@ -19,10 +19,8 @@ package org.apache.sysds.test.component.compress.colgroup; -import static org.apache.sysds.runtime.compress.colgroup.ColGroupFactory.computeSegmentCost; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; -import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.ArrayList; import java.util.Collection; @@ -53,7 +51,6 @@ @RunWith(value = Parameterized.class) public class ColGroupFactoryTest { - private final MatrixBlock mb; private final MatrixBlock mbt; private final ACostEstimate ce; @@ -330,7 +327,5 @@ public boolean isContiguous() { public int numBlocks() { return 2; } - - } } From a0d08d708d66c03fa4df769d073cdf4c3b4837bb Mon Sep 17 00:00:00 2001 From: mori49 <84979219+mori49@users.noreply.github.com> Date: Thu, 5 Feb 2026 22:51:18 +0100 Subject: [PATCH 18/21] fix: repeated compression on every column extract: methods for compressPiecewiseLinearCompression --- .../compress/colgroup/ColGroupFactory.java | 206 +++--------------- 1 file changed, 34 insertions(+), 172 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java index 67f2c492e09..b51111a4aba 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java @@ -43,6 +43,7 @@ import 
org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory; import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary; import org.apache.sysds.runtime.compress.colgroup.functional.LinearRegression; +import org.apache.sysds.runtime.compress.colgroup.functional.PiecewiseLinearUtils; import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory; import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; import org.apache.sysds.runtime.compress.colgroup.insertionsort.AInsertionSorter; @@ -51,7 +52,6 @@ import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory; import org.apache.sysds.runtime.compress.colgroup.offset.AOffset; import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory; -import org.apache.sysds.runtime.compress.colgroup.scheme.ColGroupPiecewiseLinearCompressed; import org.apache.sysds.runtime.compress.cost.ACostEstimate; import org.apache.sysds.runtime.compress.estim.CompressedSizeInfo; import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup; @@ -306,9 +306,7 @@ else if(ct == CompressionType.LinearFunctional) { } } else if(ct == CompressionType.PiecewiseLinear) { - return compressPiecewiseLinearFunctional(colIndexes, in, cs); - } else if(ct == CompressionType.DDCFOR) { AColGroup g = directCompressDDC(colIndexes, cg); @@ -1074,178 +1072,42 @@ private static AColGroup compressLinearFunctional(IColIndex colIndexes, MatrixBl return ColGroupLinearFunctional.create(colIndexes, coefficients, numRows); } - public static AColGroup compressPiecewiseLinearFunctional(IColIndex colIndexes, MatrixBlock in, - CompressionSettings cs) { - - //Erstmal den Inhalt einer Spalte speichern - - int numRows = in.getNumRows(); - int colIdx = colIndexes.get(0); //Die erste Spalte - double[] column = getColumn(in, colIdx); - - //Sette den Targetloss - - // Breakpoints bestimmen: Einteilung der Segmente - - List breakpointsList = computeBreakpoints(cs, column); - int[] breakpoints = breakpointsList.stream().mapToInt(Integer::intValue).toArray(); - //Für jedes Segment lineare Regression als kompressionsverfahren - - // 3) Pro Segment Regression -> a,b - int numSeg = breakpoints.length - 1; - double[] slopes = new double[numSeg]; - double[] intercepts = new double[numSeg]; - - for(int s = 0; s < numSeg; s++) { - int start = breakpoints[s]; - int end = breakpoints[s + 1]; - - double[] ab = regressSegment(column, start, end); // nutzt gleiche Stats wie computeSegmentCost - slopes[s] = ab[0]; - intercepts[s] = ab[1]; - } - //Erstelle die Datenstruktur: PiecewiseLinearColGroupCompressed - - return ColGroupPiecewiseLinearCompressed.create(colIndexes, breakpoints, slopes, intercepts, numRows); - } - - public static double[] getColumn(MatrixBlock in, int colIndex) { - int numRows = in.getNumRows(); // Anzahl der Zeilen [web:16] - double[] column = new double[numRows]; // Variable für die Spalte - - for(int r = 0; r < numRows; r++) { - column[r] = in.get(r, colIndex); // Wert (r, colIndex) lesen [web:16][web:25] - } - return column; - } - - public static List computeBreakpoints(CompressionSettings cs, double[] column) { - int n = column.length; - double targetMSE = cs.getPiecewiseTargetLoss(); - // Fall A: kein TargetLoss angegeben -> einfache Variante mit fixem λ - if(Double.isNaN(targetMSE) || targetMSE <= 0) { - double lambda = 5.0; - return computeBreakpointsLambda(column, lambda); - } - - // Fall B: TargetLoss gesetzt -> globales Fehlerbudget berücksichtigen - double sseMax = n * targetMSE; // MSE -> SSE-Budget - - double 
lambdaMin = 0.0; // viele Segmente, minimaler Fehler - double lambdaMax = 1e6; // wenige Segmente, mehr Fehler - - List bestBreaks = null; - - for(int it = 0; it < 20; it++) { // Binärsuche auf λ - double lambda = 0.5 * (lambdaMin + lambdaMax); - - List breaks = computeBreakpointsLambda(column, lambda); - double totalSSE = computeTotalSSE(column, breaks); - - if(totalSSE <= sseMax) { - // Budget eingehalten: wir können versuchen, mit größerem λ noch weniger Segmente zu nehmen - bestBreaks = breaks; - lambdaMin = lambda; - } - else { - // Fehler zu groß: λ verkleinern, mehr Segmente zulassen - lambdaMax = lambda; - } - } - - if(bestBreaks == null) - bestBreaks = computeBreakpointsLambda(column, lambdaMin); - - return bestBreaks; - } - - public static List computeBreakpointsLambda(double[] column, double lambda) { - int sizeColumn = column.length; - double[] dp = new double[sizeColumn + 1]; - int[] prev = new int[sizeColumn + 1]; - - dp[0] = 0.0; - - for(int index = 1; index <= sizeColumn; index++) { - dp[index] = Double.POSITIVE_INFINITY; - for(int i = 0; i < index; i++) { // Segment [i, index) - double costCurrentSegment = computeSegmentCost(column, i, index); // SSE - double candidateCost = dp[i] + costCurrentSegment + lambda; - if(candidateCost < dp[index]) { - dp[index] = candidateCost; - prev[index] = i; - } + public static AColGroup compressPiecewiseLinearFunctional( + IColIndex colIndexes, MatrixBlock in, CompressionSettings cs) { + + final int numRows = in.getNumRows(); + AColGroup result = null; + + //Compress every column + for (int col = 0; col < colIndexes.size(); col++) { + // get Column Index + IColIndex.SliceResult sliceResult = colIndexes.slice(col, col + 1); + IColIndex singleColIndex = sliceResult.ret; // ← .ret nötig! + + // Get Column from Matrix + final int colIdx = colIndexes.get(col); + double[] column = PiecewiseLinearUtils.getColumn(in, colIdx); + + //Compress column + PiecewiseLinearUtils.SegmentedRegression fit = + PiecewiseLinearUtils.compressSegmentedLeastSquares(column, cs); + + AColGroup singleGroup = ColGroupPiecewiseLinearCompressed.create( + singleColIndex, + fit.getBreakpoints(), + fit.getSlopes(), + fit.getIntercepts(), + numRows); + + // Combine multiple columns + if (result == null) { + result = singleGroup; + } else { + result = result.combineWithSameIndex(numRows, col, singleGroup); } } - List segmentLimits = new ArrayList<>(); - int breakpointIndex = sizeColumn; - while(breakpointIndex > 0) { - segmentLimits.add(breakpointIndex); - breakpointIndex = prev[breakpointIndex]; - } - segmentLimits.add(0); - Collections.sort(segmentLimits); - return segmentLimits; - } - - public static double computeSegmentCost(double[] column, int start, int end) { - int n = end - start; - if(n <= 1) - return 0.0; - - double[] ab = regressSegment(column, start, end); - double slope = ab[0]; - double intercept = ab[1]; - - double sse = 0.0; - for(int i = start; i < end; i++) { - double x = i; - double y = column[i]; - double yhat = slope * x + intercept; - double diff = y - yhat; - sse += diff * diff; - } - return sse; // oder sse / n als MSE - } - - public static double computeTotalSSE(double[] column, List breaks) { - double total = 0.0; - for(int s = 0; s < breaks.size() - 1; s++) { - int start = breaks.get(s); - int end = breaks.get(s + 1); - total += computeSegmentCost(column, start, end); // SSE des Segments - } - return total; - } - - public static double[] regressSegment(double[] column, int start, int end) { - int n = end - start; - if(n <= 0) - return new double[] 
{0.0, 0.0};
-
-		double sumX = 0, sumY = 0, sumXX = 0, sumXY = 0;
-		for(int i = start; i < end; i++) {
-			double x = i;
-			double y = column[i];
-			sumX += x;
-			sumY += y;
-			sumXX += x * x;
-			sumXY += x * y;
-		}
-
-		double nD = n;
-		double denom = nD * sumXX - sumX * sumX;
-		double slope, intercept;
-		if(denom == 0) {
-			slope = 0.0;
-			intercept = sumY / nD;
-		}
-		else {
-			slope = (nD * sumXY - sumX * sumY) / denom;
-			intercept = (sumY - slope * sumX) / nD;
-		}
-		return new double[] {slope, intercept};
+	public static AColGroup compressPiecewiseLinearFunctional(
+		IColIndex colIndexes, MatrixBlock in, CompressionSettings cs) {
+
+		final int numRows = in.getNumRows();
+		AColGroup result = null;
+
+		// compress every column separately
+		for (int col = 0; col < colIndexes.size(); col++) {
+			// slice out the index structure of the single column
+			IColIndex.SliceResult sliceResult = colIndexes.slice(col, col + 1);
+			IColIndex singleColIndex = sliceResult.ret; // .ret carries the sliced index
+
+			// materialize the column from the matrix
+			final int colIdx = colIndexes.get(col);
+			double[] column = PiecewiseLinearUtils.getColumn(in, colIdx);
+
+			// compress the column
+			PiecewiseLinearUtils.SegmentedRegression fit =
+				PiecewiseLinearUtils.compressSegmentedLeastSquares(column, cs);
+
+			AColGroup singleGroup = ColGroupPiecewiseLinearCompressed.create(
+				singleColIndex,
+				fit.getBreakpoints(),
+				fit.getSlopes(),
+				fit.getIntercepts(),
+				numRows);
+
+			// combine the per-column groups into a single result
+			if (result == null) {
+				result = singleGroup;
+			} else {
+				result = result.combineWithSameIndex(numRows, col, singleGroup);
+			}
+		}
-
+		return result;
+	}

	private AColGroup compressSDCFromSparseTransposedBlock(IColIndex cols, int nrUniqueEstimate, double tupleSparsity) {
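For reference between these two patches: every segment in this series is fitted with plain ordinary least squares over the row index, slope = (n*Sxy - Sx*Sy) / (n*Sxx - Sx^2) and intercept = (Sy - slope*Sx) / n, exactly the running sums kept by regressSegment(). A minimal, self-contained sketch of that computation (illustrative only, not part of any commit in this series; the class name SegmentFitSketch is made up):

    public class SegmentFitSketch {
        public static void main(String[] args) {
            double[] column = {1.0, 3.0, 5.0, 7.0}; // y = 2*x + 1 over rows 0..3
            int n = column.length;
            double sx = 0, sy = 0, sxx = 0, sxy = 0;
            for(int i = 0; i < n; i++) {
                sx += i;               // sum of row indexes
                sy += column[i];       // sum of values
                sxx += (double) i * i; // sum of squared row indexes
                sxy += i * column[i];  // sum of cross products
            }
            double denom = n * sxx - sx * sx; // zero only for degenerate segments
            double slope = denom == 0 ? 0.0 : (n * sxy - sx * sy) / denom;
            double intercept = (sy - slope * sx) / n;
            System.out.println(slope + " " + intercept); // prints 2.0 1.0
        }
    }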
From dfe2eee4665ae735b47f37a041433fec709dcbb3 Mon Sep 17 00:00:00 2001
From: mori49 <84979219+mori49@users.noreply.github.com>
Date: Thu, 5 Feb 2026 22:52:16 +0100
Subject: [PATCH 19/21] add: utils, methods to calculate piecewiseLinearCompression

---
 .../functional/PiecewiseLinearUtils.java | 253 ++++++++++++++++++
 1 file changed, 253 insertions(+)
 create mode 100644 src/main/java/org/apache/sysds/runtime/compress/colgroup/functional/PiecewiseLinearUtils.java

diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/functional/PiecewiseLinearUtils.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/functional/PiecewiseLinearUtils.java
new file mode 100644
index 00000000000..7005be9de65
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/functional/PiecewiseLinearUtils.java
@@ -0,0 +1,253 @@
+package org.apache.sysds.runtime.compress.colgroup.functional;
+
+import org.apache.sysds.runtime.compress.CompressionSettings;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+public class PiecewiseLinearUtils {
+
+	private PiecewiseLinearUtils() {
+		// static utility class, not instantiable
+	}
+
+	public static final class SegmentedRegression {
+		private final int[] breakpoints;
+		private final double[] slopes;
+		private final double[] intercepts;
+
+		public SegmentedRegression(int[] breakpoints, double[] slopes, double[] intercepts) {
+			this.breakpoints = breakpoints;
+			this.slopes = slopes;
+			this.intercepts = intercepts;
+		}
+
+		public int[] getBreakpoints() {
+			return breakpoints;
+		}
+
+		public double[] getSlopes() {
+			return slopes;
+		}
+
+		public double[] getIntercepts() {
+			return intercepts;
+		}
+	}
+
+	public static SegmentedRegression compressSegmentedLeastSquares(double[] column, CompressionSettings cs) {
+		// compute the breakpoints of a column via dynamic programming
+		final List<Integer> breakpointsList = computeBreakpoints(cs, column);
+		final int[] breakpoints = breakpointsList.stream().mapToInt(Integer::intValue).toArray();
+
+		// allocate the regression outputs
+		final int numSeg = breakpoints.length - 1;
+		final double[] slopes = new double[numSeg];
+		final double[] intercepts = new double[numSeg];
+
+		// fit one regression line per segment
+		for (int seg = 0; seg < numSeg; seg++) {
+			final int segStart = breakpoints[seg];
+			final int segEnd = breakpoints[seg + 1];
+
+			final double[] line = regressSegment(column, segStart, segEnd);
+			slopes[seg] = line[0]; // slope of the regression line
+			intercepts[seg] = line[1]; // intercept of the regression line
+		}
+
+		return new SegmentedRegression(breakpoints, slopes, intercepts);
+	}
+
+	public static SegmentedRegression compressSegmentedLeastSquaresV2(double[] column, CompressionSettings cs) {
+		// compute the breakpoints of a column via the greedy algorithm
+		final List<Integer> breakpointsList = computeBreakpointsGreedy(column, cs);
+		final int[] breakpoints = breakpointsList.stream().mapToInt(Integer::intValue).toArray();
+
+		// allocate the regression outputs
+		final int numSeg = breakpoints.length - 1;
+		final double[] slopes = new double[numSeg];
+		final double[] intercepts = new double[numSeg];
+
+		// fit one regression line per segment
+		for (int seg = 0; seg < numSeg; seg++) {
+			final int segStart = breakpoints[seg];
+			final int segEnd = breakpoints[seg + 1];
+			final double[] line = regressSegment(column, segStart, segEnd);
+			slopes[seg] = line[0];
+			intercepts[seg] = line[1];
+		}
+		return new SegmentedRegression(breakpoints, slopes, intercepts);
+	}
+
+	public static double[] getColumn(MatrixBlock in, int colIndex) {
+		final int numRows = in.getNumRows();
+		final double[] column = new double[numRows];
+
+		for (int row = 0; row < numRows; row++) {
+			column[row] = in.get(row, colIndex);
+		}
+		return column;
+	}
+
+	public static List<Integer> computeBreakpoints(CompressionSettings cs, double[] column) {
+		final int numElements = column.length;
+		final double targetMSE = cs.getPiecewiseTargetLoss();
+
+		// TODO: maybe remove the fallback if no target loss is given
+		/*if (Double.isNaN(targetMSE) || targetMSE <= 0) {
+			final double segmentPenalty = 2.0 * Math.log(numElements);
+			return computeBreakpointsLambda(column, segmentPenalty);
+		}*/
+
+		// maximum allowed total loss: MSE target -> SSE budget
+		final double sseMax = numElements * targetMSE;
+		double minLoss = 0.0;
+		double maxLoss = numElements * 100.0;
+		List<Integer> bestBreaks = null;
+		// binary search on the segment penalty to meet the budget
+		while (maxLoss - minLoss > 1e-8) {
+			final double currentLoss = 0.5 * (minLoss + maxLoss);
+			final List<Integer> breaks = computeBreakpointsLambda(column, currentLoss);
+			final double totalSSE = computeTotalSSE(column, breaks);
+			if (totalSSE <= sseMax) {
+				bestBreaks = breaks;
+				minLoss = currentLoss;
+			}
+			else {
+				maxLoss = currentLoss;
+			}
+		}
+
+		if (bestBreaks == null)
+			bestBreaks = computeBreakpointsLambda(column, minLoss);
+
+		return bestBreaks;
+	}
+
+	public static List<Integer> computeBreakpointsLambda(double[] column, double lambda) {
+		final int numrows = column.length;
+		final double[] costs = new double[numrows + 1]; // minimal cost per prefix
+		final int[] prevStart = new int[numrows + 1]; // start of the last segment
+		costs[0] = 0.0;
+		// fill the cost table
+		for (int rowEnd = 1; rowEnd <= numrows; rowEnd++) {
+			costs[rowEnd] = Double.POSITIVE_INFINITY;
+			// test all possible segments to find the lowest cost
+			for (int rowStart = 0; rowStart < rowEnd; rowStart++) {
+				// cost = prefix cost + segment loss + penalty
+				final double costCurrentSegment = computeSegmentCost(column, rowStart, rowEnd);
+				final double totalCost = costs[rowStart] + costCurrentSegment + lambda;
+				// keep the candidate if it is a better solution
+				if (totalCost < costs[rowEnd]) {
+					costs[rowEnd] = totalCost;
+					prevStart[rowEnd] = rowStart;
+				}
+			}
+		}
+		// backtrack the optimal segment limits
+		final List<Integer> segmentLimits = new ArrayList<>();
+		int breakpointIndex = numrows;
+		while (breakpointIndex > 0) {
+			segmentLimits.add(breakpointIndex);
+			breakpointIndex = prevStart[breakpointIndex];
+		}
+		segmentLimits.add(0);
+		Collections.sort(segmentLimits);
+		return segmentLimits;
+	}
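+	// Editor's note (illustrative, not part of the original commit): the DP above
+	// solves the segmented least-squares recurrence
+	//   costs[j] = min over i < j of ( costs[i] + SSE(i, j) + lambda ),
+	// where SSE(i, j) is the residual of the best line over rows [i, j) and each
+	// extra segment pays the penalty lambda. Worked example, column {1, 1, 9, 9}
+	// with lambda = 1: SSE(0,2) = SSE(2,4) = 0 (two equal values fit exactly), so
+	// costs[2] = 0 + 1 = 1 and costs[4] = costs[2] + 0 + 1 = 2, which beats the
+	// single segment [0,4) costing SSE 12.8 + 1 = 13.8; the backtrace therefore
+	// yields breakpoints [0, 2, 4]. As written, each of the O(n^2) (start, end)
+	// pairs recomputes its SSE in O(n), so the DP is O(n^3); keeping prefix sums
+	// of x, y, x*x and x*y over the column would make every SSE lookup O(1) and
+	// the whole DP O(n^2).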
+
+	public static double computeSegmentCost(double[] column, int start, int end) {
+		final int segSize = end - start;
+		if (segSize <= 1)
+			return 0.0;
+
+		final double[] ab = regressSegment(column, start, end); // fitted regression line
+		final double slope = ab[0];
+		final double intercept = ab[1];
+
+		double sumSquaredError = 0.0;
+		for (int i = start; i < end; i++) {
+			final double rowIdx = i;
+			final double actualValue = column[i];
+			final double predictedValue = slope * rowIdx + intercept;
+			final double difference = actualValue - predictedValue;
+			sumSquaredError += difference * difference;
+		}
+		return sumSquaredError;
+	}
+
+	public static double computeTotalSSE(double[] column, List<Integer> breaks) {
+		double total = 0.0;
+		for (int s = 0; s < breaks.size() - 1; s++) {
+			final int start = breaks.get(s);
+			final int end = breaks.get(s + 1);
+			total += computeSegmentCost(column, start, end);
+		}
+		return total;
+	}
+
+	public static double[] regressSegment(double[] column, int start, int end) {
+		final int numElements = end - start;
+		if (numElements <= 0)
+			return new double[] {0.0, 0.0};
+
+		double sumOfRowIndices = 0, sumOfColumnValues = 0, sumOfRowIndicesSquared = 0, productRowIndexTimesColumnValue = 0;
+		for (int i = start; i < end; i++) {
+			final double x = i;
+			final double y = column[i];
+			sumOfRowIndices += x;
+			sumOfColumnValues += y;
+			sumOfRowIndicesSquared += x * x;
+			productRowIndexTimesColumnValue += x * y;
+		}
+
+		final double numPointsInSegmentDouble = numElements;
+		final double denominatorForSlope = numPointsInSegmentDouble * sumOfRowIndicesSquared - sumOfRowIndices * sumOfRowIndices;
+		final double slope;
+		final double intercept;
+		if (denominatorForSlope == 0) {
+			slope = 0.0;
+			intercept = sumOfColumnValues / numPointsInSegmentDouble;
+		}
+		else {
+			slope = (numPointsInSegmentDouble * productRowIndexTimesColumnValue - sumOfRowIndices * sumOfColumnValues) / denominatorForSlope;
+			intercept = (sumOfColumnValues - slope * sumOfRowIndices) / numPointsInSegmentDouble;
+		}
+		return new double[] {slope, intercept};
+	}
+
+	public static List<Integer> computeBreakpointsGreedy(double[] column, CompressionSettings cs) {
+		final int numElements = column.length;
+		final double targetMSE = cs.getPiecewiseTargetLoss();
+		if (Double.isNaN(targetMSE) || targetMSE <= 0) {
+			return Arrays.asList(0, numElements); // fallback: a single segment
+		}
+
+		List<Integer> breakpoints = new ArrayList<>();
+		breakpoints.add(0);
+		int currentStart = 0;
+
+		while (currentStart < numElements) {
+			int bestEnd = numElements; // default: the remainder forms one segment
+			for (int end = currentStart + 1; end <= numElements; end++) {
+				double sse = computeSegmentCost(column, currentStart, end);
+				double sseMax = (end - currentStart) * targetMSE;
+				if (sse > sseMax) {
+					bestEnd = end - 1; // last valid end point
+					break;
+				}
+			}
+			breakpoints.add(bestEnd);
+			currentStart = bestEnd;
+		}
+
+		if (breakpoints.get(breakpoints.size() - 1) != numElements) {
+			breakpoints.add(numElements);
+		}
+		return breakpoints;
+	}
+}
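A quick usage sketch of the utilities added above, mirroring the expectations that the tests in the next patch encode (illustrative only, not one of the series' commits; the class name PiecewiseLinearUtilsDemo is made up):

    import java.util.Arrays;

    import org.apache.sysds.runtime.compress.CompressionSettings;
    import org.apache.sysds.runtime.compress.CompressionSettingsBuilder;
    import org.apache.sysds.runtime.compress.colgroup.functional.PiecewiseLinearUtils;

    public class PiecewiseLinearUtilsDemo {
        public static void main(String[] args) {
            // two flat plateaus -> two zero-slope segments
            double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0};
            CompressionSettings cs = new CompressionSettingsBuilder().create();
            cs.setPiecewiseTargetLoss(1e-3); // per-row MSE budget driving the penalty search
            PiecewiseLinearUtils.SegmentedRegression fit =
                PiecewiseLinearUtils.compressSegmentedLeastSquares(column, cs);
            System.out.println(Arrays.toString(fit.getBreakpoints())); // [0, 3, 6]
            System.out.println(Arrays.toString(fit.getSlopes()));      // [0.0, 0.0]
            System.out.println(Arrays.toString(fit.getIntercepts()));  // [1.0, 2.0]
        }
    }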
From 9e0d18b3c198d4d3e252a4ca7970da9c217fac82 Mon Sep 17 00:00:00 2001
From: mori49 <84979219+mori49@users.noreply.github.com>
Date: Thu, 5 Feb 2026 23:58:48 +0100
Subject: [PATCH 20/21] wip: clear up tests
 add: test with randomly generated Data

---
 ...ColGroupPiecewiseLinearCompressedTest.java | 234 +++++++++++-------
 1 file changed, 144 insertions(+), 90 deletions(-)
 rename src/test/java/org/apache/sysds/{runtime => test/component}/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java (76%)

diff --git a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java b/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java
similarity index 76%
rename from src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java
rename to src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java
index 4f309fda967..fa1f88fab98 100644
--- a/src/test/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java
+++ b/src/test/java/org/apache/sysds/test/component/compress/colgroup/ColGroupPiecewiseLinearCompressedTest.java
@@ -1,27 +1,37 @@
-package org.apache.sysds.runtime.compress.colgroup;
+package org.apache.sysds.test.component.compress.colgroup;

 import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.CompressionSettingsBuilder;
+import org.apache.sysds.runtime.compress.colgroup.AColGroup;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupPiecewiseLinearCompressed;
+import org.apache.sysds.runtime.compress.colgroup.functional.PiecewiseLinearUtils;
 import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
 import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
-import org.apache.sysds.runtime.compress.colgroup.scheme.ColGroupPiecewiseLinearCompressed;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeInfo;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
 import org.apache.sysds.runtime.compress.estim.EstimationFactors;
 import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.util.DataConverter;
+import org.apache.sysds.test.AutomatedTestBase;
 import org.junit.Test;

 import java.util.Arrays;
 import java.util.List;

-import static org.apache.sysds.runtime.compress.colgroup.ColGroupFactory.*;
+import static org.apache.sysds.runtime.compress.colgroup.functional.PiecewiseLinearUtils.*;
+import static org.apache.sysds.test.functions.io.binary.BlocksizeTest.sparsity;
 import static org.junit.Assert.*;

-public class ColGroupPiecewiseLinearCompressedTest {
+public class ColGroupPiecewiseLinearCompressedTest extends AutomatedTestBase {
+
+	@Override
+	public void setUp() {
+
+	}

 	@Test
-	public void testComputeBreakpoints_uniformColumn() {
+	public void testComputeBreakpointsUniformColumn() {
 		CompressionSettings cs = new CompressionSettingsBuilder().create();
 		cs.setPiecewiseTargetLoss(1e-3);
 		double[] column = {1.0, 1.0, 1.0, 1.0, 1.0}; // test-specific input
 		List<Integer> breaks = computeBreakpoints(cs, column);
 	}

 	@Test
-	public void testComputeBreakpoints_linearIncreasing() {
+	public void testComputeBreakpointsLinearIncreasing() {
 		CompressionSettings cs = new CompressionSettingsBuilder().create();
 		cs.setPiecewiseTargetLoss(1e-3);
 		double[] column = {0.0, 1.0, 2.0, 3.0, 4.0}; // a different input column
 		List<Integer> breaks = computeBreakpoints(cs, column);
 	}

-	@Test
-	public void testComputeBreakpoints_highLoss_uniform() {
-		CompressionSettings cs = new CompressionSettingsBuilder().create();
-		cs.setPiecewiseTargetLoss(10000.0);
-		double[] column = {1.0, 1.0, 1.0, 1.0, 1.0};
-		List<Integer> breaks = computeBreakpoints(cs, column);
-		assertEquals(Arrays.asList(0, 5), breaks);
-	}
+
 	@Test
-	public void testComputeBreakpoints_twoSegments() {
+	public void testComputeBreakpointsTwoSegments() {
 		CompressionSettings cs = new CompressionSettingsBuilder().create();
 		cs.setPiecewiseTargetLoss(1e-3);
 		// {1,1,1, 2,2,2} → 2 segments → [0,3,6]
 		double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0};
 		List<Integer> breaks = computeBreakpoints(cs, column);
 		assertEquals(Arrays.asList(0, 3, 6), breaks);
 	}

-	@Test
-	public void testComputeBreakpoints_noLoss_linear() {
-		CompressionSettings cs = new CompressionSettingsBuilder().create();
-		cs.setPiecewiseTargetLoss(0.0);
-		//cs.setPiecewiseTargetLoss(0.0);
-		double[] column = {0.0, 1.0, 2.0, 3.0, 4.0};
-		List<Integer> breaks = computeBreakpoints(cs, column);
-		assertEquals(Arrays.asList(0, 5), breaks); // with zero loss, all breakpoints
-	}
+
 	@Test
-	public void testComputeBreakpointsLambda_const() {
+	public void testComputeBreakpointsLambdaConst() {
 		double[] column = {1.0, 1.0, 1.0, 1.0, 1.0};
 		List<Integer> breaks = computeBreakpointsLambda(column, 5.0);
 		assertEquals(Arrays.asList(0, 5), breaks);
@@ -79,7 +74,7 @@ public void testComputeBreakpointsLambda_const() {
 	}

 	@Test
-	public void testComputeBreakpointsLambda_twoSegments() {
+	public void testComputeBreakpointsLambdaTwoSegments() {
 		double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0}; // 6 values

 		// with a small lambda -> many segments (splits are almost free)
@@ -94,7 +89,7 @@ public void testComputeBreakpointsLambda_twoSegments() {
 	}

 	@Test
-	public void testComputeBreakpointsLambda_jumpWithTrend() {
+	public void testComputeBreakpointsLambdaJumpWithTrend() {
 		double[] column = {0.0, 1.0, 2.0, 10.0, 11.0, 12.0};

 		// coarse segmentation: one segment per "section"
@@ -107,7 +102,7 @@ public void testComputeBreakpointsLambda_jumpWithTrend() {
 	}

 	@Test
-	public void testComputeBreakpointsLambda_linear() {
+	public void testComputeBreakpointsLambdaLinear() {
 		double[] column = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0};

 		List<Integer> breaks = computeBreakpointsLambda(column, 1.0);
@@ -121,7 +116,7 @@ public void testComputeBreakpointsLambda_linear() {
 	}

 	@Test
-	public void testComputeBreakpointsLambda_edge_lambdaVerySmall() {
+	public void testComputeBreakpointsLambdaEdgeLambdaVerySmall() {
 		double[] column = {1.0, 1.1, 1.0, 1.1, 1.0};

 		List<Integer> breaks = computeBreakpointsLambda(column, 0.001);
@@ -137,7 +132,7 @@ public void testComputeBreakpointsLambda_edge_lambdaVerySmall() {
 	}

 	@Test
-	public void testComputeBreakpointsLambda_edge_lambdaVeryLarge() {
+	public void testComputeBreakpointsLambdaEdgeLambdaVeryLarge() {
 		double[] column = {1.0, 2.0, 1.5, 2.5, 1.8};

 		List<Integer> breaks = computeBreakpointsLambda(column, 1000.0);
@@ -145,7 +140,7 @@ public void testComputeBreakpointsLambda_edge_lambdaVeryLarge() {
 	}

 	@Test
-	public void testComputeSegmentCost_emptyOrSingle() {
+	public void testComputeSegmentCostEmptyOrSingle() {
 		double[] column = {10.0, 20.0, 30.0};

 		// 0 elements (empty segment)
@@ -159,7 +154,7 @@ public void testComputeSegmentCost_emptyOrSingle() {
 	}

 	@Test
-	public void testComputeSegmentCost_twoConstantPoints() {
+	public void testComputeSegmentCostTwoConstantPoints() {
 		double[] column = {5.0, 5.0, 1.0, 1.0};

 		// two identical points (constant) → SSE = 0
@@ -168,7 +163,7 @@ public void testComputeSegmentCost_twoConstantPoints() {
 	}

 	@Test
-	public void testComputeSegmentCost_twoDifferentPoints() {
+	public void testComputeSegmentCostTwoDifferentPoints() {
 		double[] column = {0.0, 2.0, 1.0, 3.0};

 		// two points: (0,0) and (1,2) → line y = 2*x, error = 0
@@ -181,14 +176,14 @@ public void testComputeSegmentCost_twoDifferentPoints() {
 	}

 	@Test
-	public void testComputeSegmentCost_constantThree() {
+	public void testComputeSegmentCostConstantThree() {
 		double[] column = {0.0, 0.0, 0.0};
 		double sse = computeSegmentCost(column, 0, 3);
 		assertEquals(0.0, sse, 1e-10);
 	}

 	@Test
-	public void testComputeSegmentCost_consistent_with_regression() {
+	public void testComputeSegmentCostConsistentWithRegression() {
 		double[] column = {0.0, 2.0, 0.0, 4.0, 0.0, 6.0};

 		int start = 0, end = 3;
@@ -205,30 +200,9 @@ public void testComputeSegmentCost_consistent_with_regression() {
 		assertEquals(sse_hand, sse, 1e-10);
 	}
-	@Test
-	public void testComputeTotalSSE_emptyBreaks() {
-		double[] column = {1.0, 2.0, 3.0};
-		List<Integer> breaks = Arrays.asList(); // empty → no segments
-		double total = computeTotalSSE(column, breaks);
-
-		// zero segments → the sum over zero segments is 0
-		assertEquals(0.0, total, 1e-10);
-	}

-	@Test
-	public void testComputeTotalSSE_singleSegment_all() {
-		double[] column = {1.0, 2.0, 3.0};
-		List<Integer> breaks = Arrays.asList(0, 3); // one segment [0,3)
-
-		double total = computeTotalSSE(column, breaks);
-		double expected = computeSegmentCost(column, 0, 3);
-
-		// the result must match the SSE of the whole segment exactly
-		assertEquals(expected, total, 1e-10);
-	}
-
 	@Test
-	public void testComputeTotalSSE_twoSegments() {
+	public void testComputeTotalSSETwoSegments() {
 		// example: [0,0,0] and [1,1,1] (each constant)
 		double[] column = {0.0, 0.0, 0.0, 1.0, 1.0, 1.0};
 		List<Integer> breaks = Arrays.asList(0, 3, 6); // two segments
@@ -243,7 +217,7 @@ public void testComputeTotalSSE_twoSegments() {
 	}

 	@Test
-	public void testComputeTotalSSE_threeSegments() {
+	public void testComputeTotalSSEThreeSegments() {
 		// one segment with three identical values, two segments with two values each
 		double[] column = {1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0};
 		List<Integer> breaks = Arrays.asList(0, 3, 5, 7);
@@ -263,7 +237,7 @@ public void testComputeTotalSSE_threeSegments() {
 	}

 	@Test
-	public void testComputeTotalSSE_gapStartEnd() {
+	public void testComputeTotalSSEGapStartEnd() {
 		double[] column = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0};
 		List<Integer> breaks = Arrays.asList(2, 5, 8);
@@ -276,7 +250,7 @@ public void testComputeTotalSSE_gapStartEnd() {
 	}

 	@Test
-	public void testComputeTotalSSE_oneSegment_identical() {
+	public void testComputeTotalSSEOneSegmentIdentical() {
 		double[] column = {1.0, 2.0, 3.0, 4.0, 5.0};

 		double sseTotal = computeSegmentCost(column, 0, 5);
@@ -287,7 +261,7 @@ public void testComputeTotalSSE_oneSegment_identical() {
 	}

 	@Test
-	public void testComputeTotalSSE_nonConstant() {
+	public void testComputeTotalSSENonConstant() {
 		double[] column = {0.0, 1.0, 2.0, 3.0, 4.0};
 		List<Integer> breaks = Arrays.asList(0, 2, 5);
@@ -300,7 +274,7 @@ public void testComputeTotalSSE_nonConstant() {
 	}

 	@Test
-	public void testComputeTotalSSE_edgeCases() {
+	public void testComputeTotalSSEEdgeCases() {
 		double[] columnEmpty = {};
 		List<Integer> breaksEmpty = Arrays.asList(0, 0);
 		assertEquals(0.0, computeTotalSSE(columnEmpty, breaksEmpty), 1e-10);
@@ -312,7 +286,7 @@ public void testComputeTotalSSE_edgeCases() {
 	}

 	@Test
-	public void testRegressSegment_empty() {
+	public void testRegressSegmentEmpty() {
 		double[] column = {1.0, 2.0, 3.0};
 		double[] result = regressSegment(column, 0, 0);
 		assertEquals(0.0, result[0], 1e-10);
@@ -320,7 +294,7 @@ public void testRegressSegment_empty() {
 	}

 	@Test
-	public void testRegressSegment_singlePoint() {
+	public void testRegressSegmentSinglePoint() {
 		double[] column = {1.0, 2.0, 3.0};

 		double[] result = regressSegment(column, 1, 2);
@@ -329,7 +303,7 @@ public void testRegressSegment_singlePoint() {
 	}

 	@Test
-	public void testRegressSegment_twoIdentical() {
+	public void testRegressSegmentTwoIdentical() {
 		double[] column = {5.0, 5.0, 1.0, 1.0};

 		double[] result = regressSegment(column, 0, 2);
@@ -338,7 +312,7 @@ public void testRegressSegment_twoIdentical() {
 	}

 	@Test
-	public void testRegressSegment_twoPoints() {
+	public void testRegressSegmentTwoPoints() {
 		double[] column = {0.0, 2.0};

 		double[] result = regressSegment(column, 0, 2);
@@ -347,7 +321,7 @@ public void testRegressSegment_twoPoints() {
 	}

 	@Test
-	public void testRegressSegment_twoPoints_offset() {
+	public void testRegressSegmentTwoPointsOffset() {
 		double[] column = {1.0, 3.0, 5.0, 7.0};

 		double[] result = regressSegment(column, 2, 4);
@@ -357,7 +331,7 @@ public void testRegressSegment_twoPoints_offset() {
 	}

 	@Test
-	public void testRegressSegment_constant() {
+	public void testRegressSegmentConstant() {
 		double[] column = {3.0, 3.0, 3.0, 3.0};

 		double[] result = regressSegment(column, 0, 4);
@@ -366,7 +340,7 @@ public void testRegressSegment_constant() {
 	}

 	@Test
-	public void testRegressSegment_linear() {
+	public void testRegressSegmentLinear() {
 		double[] column = new double[4];
 		double a = 1.5, b = 2.0;
 		for(int i = 0; i < 4; i++) {
@@ -379,17 +353,10 @@ public void testRegressSegment_linear() {
 		assertEquals(a, result[0], 1e-10);
 		assertEquals(b, result[1], 1e-10);
 	}

-	@Test
-	public void testRegressSegment_denomZero() {
-		double[] column = {10.0};
-		double[] result = regressSegment(column, 0, 1);
-		assertEquals(0.0, result[0], 1e-10);
-		assertEquals(10.0, result[1], 1e-10);
-	}

 	@Test
-	public void testCompressPiecewiseLinearFunctional_const() {
+	public void testCompressPiecewiseLinearFunctionalConst() {
 		// 1. create a MatrixBlock with a single constant column
 		int nrows = 20, ncols = 1;
 		MatrixBlock in = new MatrixBlock(nrows, ncols, false);
 		in.allocateDenseBlock();
@@ -428,35 +395,35 @@ public void testCompressPiecewiseLinearFunctional_const() {
 	}

 	@Test(expected = IllegalArgumentException.class)
-	public void testCreate_nullBreakpoints() {
+	public void testCreateNullBreakpoints() {
 		int[] nullBp = null;
 		ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), nullBp, new double[] {1.0},
 			new double[] {0.0}, 10);
 	}

 	@Test(expected = IllegalArgumentException.class)
-	public void testCreate_tooFewBreakpoints() {
+	public void testCreateTooFewBreakpoints() {
 		int[] singleBp = {0};
 		ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), singleBp, new double[] {1.0},
 			new double[] {0.0}, 10);
 	}

 	@Test(expected = IllegalArgumentException.class)
-	public void testCreate_inconsistentSlopes() {
+	public void testCreateInconsistentSlopes() {
 		int[] bp = {0, 5, 10};
 		ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), bp,
 			new double[] {1.0, 2.0, 3.0}, new double[] {0.0, 1.0}, 10);
 	}

 	@Test(expected = IllegalArgumentException.class)
-	public void testCreate_inconsistentIntercepts() {
+	public void testCreateInconsistentIntercepts() {
 		int[] bp = {0, 5, 10};
 		ColGroupPiecewiseLinearCompressed.create(ColIndexFactory.create(new int[] {0}), bp, new double[] {1.0, 2.0},
 			new double[] {0.0}, 10);
 	}

 	@Test
-	public void testCreate_validMultiSegment() {
+	public void testCreateValidMultiSegment() {
 		int[] bp = {0, 3, 7, 10};
 		double[] slopes = {1.0, -2.0, 0.5};
 		double[] intercepts = {0.0, 5.0, -1.0};
@@ -469,7 +436,7 @@ public void testCreate_validMultiSegment() {
 	}

 	@Test
-	public void testCreate_multiColumn() {
+	public void testCreateMultiColumn() {
 		IColIndex cols = ColIndexFactory.create(new int[] {5, 10, 15});
 		int[] bp = {0, 5};
 		double[] slopes = {3.0};
 		double[] intercepts = {7.0};
@@ -500,7 +467,7 @@ public void testCreate_multiColumn() {
 	}

 	@Test
-	public void testCreate_singleColumn() {
+	public void testCreateSingleColumn() {
 		IColIndex cols = ColIndexFactory.create(new int[] {5});
 		int[] bp = {0, 5};
 		double[] slopes = {3.0};
 		double[] intercepts = {7.0};
@@ -516,7 +483,7 @@ public void testCreate_singleColumn() {
 	}

 	@Test
-	public void testCreate_validMinimal() {
+	public void testCreateValidMinimal() {
 		// one segment: [0,10] → y = 2.0 * r + 1.0
 		int[] bp = {0, 10};
@@ -586,7 +553,7 @@ private ColGroupPiecewiseLinearCompressed createTestGroup(int numRows) {
 	@Test
-	public void testDecompressToDenseBlock_fullRange() {
+	public void testDecompressToDenseBlockFullRange() {
 		ColGroupPiecewiseLinearCompressed cg = createTestGroup(12);

 		MatrixBlock target = new MatrixBlock(12, 1, false);
@@ -606,7 +573,7 @@ public void testDecompressToDenseBlock_fullRange() {
 	}

 	@Test
-	public void testDecompressToDenseBlock_partialRange() {
+	public void testDecompressToDenseBlockPartialRange() {
 		ColGroupPiecewiseLinearCompressed cg = createTestGroup(12);

 		MatrixBlock target = new MatrixBlock(12, 1, false);
@@ -625,7 +592,7 @@ public void testDecompressToDenseBlock_partialRange() {
 	}

 	@Test
-	public void testDecompressToDenseBlock_emptyRange() {
+	public void testDecompressToDenseBlockEmptyRange() {
 		ColGroupPiecewiseLinearCompressed cg = createTestGroup(12);

 		MatrixBlock target = new MatrixBlock(5, 1, false);
@@ -643,7 +610,7 @@ public void testDecompressToDenseBlock_emptyRange() {
 	}

 	@Test
-	public void testDecompressToDenseBlock_nullSafety() {
+	public void testDecompressToDenseBlockNullSafety() {
 		ColGroupPiecewiseLinearCompressed cg = createTestGroup(10);

 		// null DenseBlock
@@ -677,7 +644,7 @@ private CompressedSizeInfo createTestCompressedSizeInfo() {
 	}

 	@Test
-	public void testCompressPiecewiseLinear_viaRealAPI() {
+	public void testCompressPiecewiseLinearViaRealAPI() {
 		MatrixBlock in = new MatrixBlock(10, 1, false);
 		in.allocateDenseBlock();
@@ -695,5 +662,92 @@ public void testCompressPiecewiseLinear_viaRealAPI() {
 		boolean hasPiecewise = colGroups.stream().anyMatch(cg -> cg instanceof ColGroupPiecewiseLinearCompressed);
 		assertTrue(hasPiecewise);
 	}
+
+	@Test
+	public void testGreedyLinearColumnSingleSegment() {
+		// 2. a perfect line → a single segment
+		double[] linearCol = {1.0, 2.0, 3.0, 4.0, 5.0}; // y = x + 1
+		CompressionSettings cs = new CompressionSettingsBuilder().create();
+		cs.setPiecewiseTargetLoss(1e-6);
+
+		List<Integer> breaks = PiecewiseLinearUtils.computeBreakpointsGreedy(linearCol, cs);
+		assertEquals("[0, 5]", breaks.toString()); // SSE = 0
+	}
+
+	@Test
+	public void testGreedyNoisyColumnMultipleSegments() {
+		// 3. with a jump → 2 segments
+		double[] noisyCol = {1.1, 1.9, 2.2, 10.1, 10.8, 11.3}; // jump at index 3
+		CompressionSettings cs = new CompressionSettingsBuilder().create();
+		cs.setPiecewiseTargetLoss(1.0); // allows an MSE of 1
+
+		List<Integer> breaks = PiecewiseLinearUtils.computeBreakpointsGreedy(noisyCol, cs);
+		// expect at least 2 segments (the jump must be detected)
+		assertTrue(breaks.size() >= 3); // [0, ?, 6]
+	}
+
+	@Test
+	public void testGreedyTargetLossIncreasesSegments() {
+		// 4. a higher target loss → fewer segments
+		double[] colWithJumps = {1,2,3, 10,11,12, 20,21,22};
+		CompressionSettings csStrict = new CompressionSettingsBuilder().create();
+		csStrict.setPiecewiseTargetLoss(0.01); // strict → many segments
+
+		CompressionSettings csLoose = new CompressionSettingsBuilder().create();
+		csLoose.setPiecewiseTargetLoss(10.0);
+
+		List<Integer> strictBreaks = PiecewiseLinearUtils.computeBreakpointsGreedy(colWithJumps, csStrict);
+		List<Integer> looseBreaks = PiecewiseLinearUtils.computeBreakpointsGreedy(colWithJumps, csLoose);
+
+		// a stricter target → more segments
+		assertTrue(strictBreaks.size() > looseBreaks.size());
+	}
+
+	@Test
+	public void testMultiColumnTargetLossRespected() {
+		final int rows = 50, cols = 2;
+		double[][] data = getRandomMatrix(rows, cols, 0, 10, 1.0, 42L);
+		MatrixBlock orig = DataConverter.convertToMatrixBlock(data);
+		orig.allocateDenseBlock();
+
+		IColIndex colIdx = ColIndexFactory.create(0, cols-1);
+		CompressionSettings cs = new CompressionSettingsBuilder().create();
+		cs.setPiecewiseTargetLoss(1.0);
+
+		AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctional(colIdx, orig, cs);
+
+		MatrixBlock target = new MatrixBlock(rows, cols, false);
+		target.allocateDenseBlock();
+		cg.decompressToDenseBlock(target.getDenseBlock(), 0, rows-1, 0, cols-1);
+
+		// check the MSE of every column
+		for (int c = 0; c < cols; c++) {
+			double mse = computeColumnMSE(orig, target, c);
+			assertTrue("Col " + c + " MSE=" + mse + " > target=1.0", mse <= 1.0);
+		}
+	}
+
+	private double computeColumnMSE(MatrixBlock orig, MatrixBlock reconstructed, int colIdx) {
+		double mse = 0.0;
+		final int numRows = orig.getNumRows();
+
+		DenseBlock origDb = orig.getDenseBlock();
+		DenseBlock reconDb = reconstructed.getDenseBlock();
+
+		for (int row = 0; row < numRows; row++) {
+			final double origValue = origDb.get(row, colIdx); // read through the DenseBlock
+			final double reconValue = reconDb.get(row, colIdx);
+			final double squaredError = (origValue - reconValue) * (origValue - reconValue);
+			mse += squaredError;
+		}
+
+		return mse / numRows;
+	}
+}

From abeced44c770ba6b4f67d182feec958a6720426d Mon Sep 17 00:00:00 2001
From: mori49 <84979219+mori49@users.noreply.github.com>
Date: Fri, 6 Feb 2026 00:00:29 +0100
Subject: [PATCH 21/21] fix: revert pom.xml

---
 pom.xml | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/pom.xml b/pom.xml
index eba29562841..08669868aa1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1577,11 +1577,5 @@
 			<artifactId>fastdoubleparser</artifactId>
 			<version>0.9.0</version>
 		</dependency>
-		<dependency>
-			<groupId>org.junit.jupiter</groupId>
-			<artifactId>junit-jupiter</artifactId>
-			<version>RELEASE</version>
-			<scope>test</scope>
-		</dependency>
-	</dependencies>
+	</dependencies>
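Taken together, the patches compose into a compress/decompress round trip; a minimal sketch mirroring testMultiColumnTargetLossRespected (illustrative only, not part of the series: the class name PiecewiseRoundTripSketch is made up, writing values through DenseBlock.set is an assumption, and the decompressToDenseBlock argument convention simply follows the tests of patch 20):

    import org.apache.sysds.runtime.compress.CompressionSettings;
    import org.apache.sysds.runtime.compress.CompressionSettingsBuilder;
    import org.apache.sysds.runtime.compress.colgroup.AColGroup;
    import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory;
    import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
    import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
    import org.apache.sysds.runtime.data.DenseBlock;
    import org.apache.sysds.runtime.matrix.data.MatrixBlock;

    public class PiecewiseRoundTripSketch {
        public static void main(String[] args) {
            final int nRows = 100;
            MatrixBlock in = new MatrixBlock(nRows, 1, false);
            in.allocateDenseBlock();
            DenseBlock inDb = in.getDenseBlock();
            for(int r = 0; r < nRows; r++)
                inDb.set(r, 0, 0.5 * r + 3.0); // exactly linear -> one segment suffices

            IColIndex cols = ColIndexFactory.create(new int[] {0});
            CompressionSettings cs = new CompressionSettingsBuilder().create();
            cs.setPiecewiseTargetLoss(1e-6);

            AColGroup cg = ColGroupFactory.compressPiecewiseLinearFunctional(cols, in, cs);

            MatrixBlock out = new MatrixBlock(nRows, 1, false);
            out.allocateDenseBlock();
            cg.decompressToDenseBlock(out.getDenseBlock(), 0, nRows - 1, 0, 0);

            double maxAbsErr = 0.0;
            for(int r = 0; r < nRows - 1; r++)
                maxAbsErr = Math.max(maxAbsErr, Math.abs(inDb.get(r, 0) - out.getDenseBlock().get(r, 0)));
            System.out.println("max abs error: " + maxAbsErr); // ~0 for exactly linear data
        }
    }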