From 81763c4a8208b5212f70cba795b4609cf97b7294 Mon Sep 17 00:00:00 2001 From: Joshua Zierhut <39177575+sevenzees@users.noreply.github.com> Date: Sun, 8 Feb 2026 14:57:15 -0600 Subject: [PATCH 01/17] Add Melt method to DataFrame --- src/Microsoft.Data.Analysis/DataFrame.cs | 209 ++++++++++++++++++ .../DataFrameTests.cs | 119 ++++++++++ 2 files changed, 328 insertions(+) diff --git a/src/Microsoft.Data.Analysis/DataFrame.cs b/src/Microsoft.Data.Analysis/DataFrame.cs index 9a1fd19523..06c6538833 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information. using System; +using System.Collections; using System.Collections.Generic; using System.Diagnostics; using System.Globalization; @@ -667,6 +668,214 @@ public DataFrame Append(IEnumerable> row, bool inPl return ret; } + /// + /// Transforms the DataFrame from wide format to long format by unpivoting specified columns. + /// This operation takes multiple value columns and "melts" them into two columns: one containing + /// the original column names (variable) and one containing the values. + /// + /// + /// Column names to use as identifier variables. These columns will be repeated in the output + /// for each value column. Must contain at least one column name. + /// + /// + /// Column names to unpivot into the variable and value columns. If null, all columns not + /// specified in will be used as value columns. + /// + /// + /// Name for the new column that will contain the original value column names. Defaults to "variable". + /// + /// + /// Name for the new column that will contain the values from the unpivoted columns. Defaults to "value". + /// If value columns have different types, this column will be of type string; otherwise, it will + /// match the type of the value columns. + /// + /// + /// If true, rows where the value is null or empty string will be excluded from the result. + /// Defaults to false. + /// + /// + /// A new DataFrame in long format with columns for each ID column, plus the variable and value columns. + /// The number of rows will be approximately (number of original rows × number of value columns), + /// or fewer if is true. + /// + /// + /// Thrown when is empty, when is specified + /// but empty, or when any column appears in both and . + /// + /// + /// Thrown when is null and there are no columns available to use as + /// value columns after excluding the ID columns. + /// + /// + /// + /// // Original DataFrame: + /// // | ID | Name | 2020 | 2021 | 2022 | + /// // |----|-------|------|------|------| + /// // | 1 | Alice | 100 | 110 | 120 | + /// // | 2 | Bob | 200 | 210 | 220 | + /// + /// var melted = df.Melt( + /// idColumns: new[] { "ID", "Name" }, + /// valueColumns: new[] { "2020", "2021", "2022" }, + /// variableName: "Year", + /// valueName: "Sales" + /// ); + /// + /// // Result: + /// // | ID | Name | Year | Sales | + /// // |----|-------|------|-------| + /// // | 1 | Alice | 2020 | 100 | + /// // | 1 | Alice | 2021 | 110 | + /// // | 1 | Alice | 2022 | 120 | + /// // | 2 | Bob | 2020 | 200 | + /// // | 2 | Bob | 2021 | 210 | + /// // | 2 | Bob | 2022 | 220 | + /// + /// + public DataFrame Melt(IEnumerable idColumns, IEnumerable valueColumns = null, string variableName = "variable", string valueName = "value", bool dropNulls = false) + { + var idColumnList = idColumns.ToList(); + var valueColumnList = valueColumns?.ToList() + ?? _columnCollection + .Where(c => !idColumnList.Contains(c.Name)) + .Select(c => c.Name) + .ToList(); + + ValidateMeltParameters(idColumnList, valueColumnList, valueColumns); + + long totalOutputRows = CalculateTotalOutputRows(valueColumnList, dropNulls); + + var outputCols = InitializeIdColumns(idColumnList, totalOutputRows); + var variableColumn = new StringDataFrameColumn(variableName, totalOutputRows); + var valueColumn = CreateValueColumn(valueColumnList, valueName, totalOutputRows); + + FillMeltedData(idColumnList, valueColumnList, outputCols, variableColumn, valueColumn, dropNulls); + + outputCols.Add(variableColumn); + outputCols.Add(valueColumn); + + return new DataFrame(outputCols); + } + + private void ValidateMeltParameters(List idColumnList, List valueColumnList, IEnumerable valueColumns) + { + if (idColumnList.Count == 0) + { + throw new ArgumentException("Must provide at least 1 ID column", "idColumns"); + } + + if (valueColumns != null && valueColumnList.Count == 0) + { + throw new ArgumentException("Must provide at least 1 value column when specifying value columns manually", nameof(valueColumns)); + } + + if (valueColumns != null && valueColumnList.Any(v => idColumnList.Contains(v))) + { + throw new ArgumentException("Columns cannot exist in both idColumns and valueColumns"); + } + + if (valueColumns == null && valueColumnList.Count == 0) + { + throw new InvalidOperationException("There are no columns in the DataFrame to use as value columns after excluding the ID columns"); + } + } + + private long CalculateTotalOutputRows(List valueColumnList, bool dropNulls) + { + if (!dropNulls) + { + return _rowCollection.Count * valueColumnList.Count; + } + + long total = 0; + + foreach (var columnName in valueColumnList) + { + var column = _columnCollection[columnName]; + + foreach (var item in column) + { + if (item is not null and not "") + { + total++; + } + } + } + + return total; + } + + private List InitializeIdColumns(List idColumnList, long size) + { + PrimitiveDataFrameColumn empty = new PrimitiveDataFrameColumn("Empty"); + var outputCols = new List(idColumnList.Count); + + foreach (var idColumnName in idColumnList) + { + var sourceColumn = _columnCollection[idColumnName]; + var newColumn = sourceColumn.Clone(empty); + newColumn.Resize(size); + outputCols.Add(newColumn); + } + + return outputCols; + } + + private DataFrameColumn CreateValueColumn(List valueColumnList, string valueName, long size) + { + var valueTypes = valueColumnList + .Select(name => _columnCollection[name].GetType()) + .Distinct() + .Count(); + + DataFrameColumn valueColumn; + + if (valueTypes > 1) + { + valueColumn = new StringDataFrameColumn(valueName, size); + } + else + { + PrimitiveDataFrameColumn empty = new PrimitiveDataFrameColumn("Empty"); + valueColumn = _columnCollection[valueColumnList[0]].Clone(empty); + valueColumn.SetName(valueName); + valueColumn.Resize(size); + } + + return valueColumn; + } + + private void FillMeltedData(List idColumnList, List valueColumnList, List outputIdCols, StringDataFrameColumn variableColumn, DataFrameColumn valueColumn, bool dropNulls) + { + bool mixedTypes = valueColumn is StringDataFrameColumn; + long currentRow = 0; + + foreach (var valueColumnName in valueColumnList) + { + var sourceValueColumn = _columnCollection[valueColumnName]; + + for (long sourceRow = 0; sourceRow < _rowCollection.Count; sourceRow++) + { + var value = sourceValueColumn[sourceRow]; + + if (dropNulls && (value is null or "")) + { + continue; + } + + for (int i = 0; i < idColumnList.Count; i++) + { + outputIdCols[i][currentRow] = _columnCollection[idColumnList[i]][sourceRow]; + } + + variableColumn[currentRow] = valueColumnName; + valueColumn[currentRow] = mixedTypes ? value?.ToString() : value; + + currentRow++; + } + } + } + /// /// Invalidates any cached data after a column has changed. /// diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index 2d75caef72..c9de1db581 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -1392,5 +1392,124 @@ public void Test_StringColumnEqualsNull() Assert.Equal(2, filteredNullDf.Columns["index"][0]); Assert.Equal(5, filteredNullDf.Columns["index"][1]); } + + public static IEnumerable GenerateDataFrameMeltData() + { + yield return new object[] + { + new DataFrame( + new Int32DataFrameColumn("id", new int?[] { 1, 2 }), + new DoubleDataFrameColumn("A", new double?[] { 10, 20 }), + new DoubleDataFrameColumn("B", new double?[] { 30, 40 }) + ), + new DataFrame( + new Int32DataFrameColumn("id", new int?[] { 1, 2, 1, 2 }), + new StringDataFrameColumn("Variable", new string[] { "A", "A", "B", "B" }), + new DoubleDataFrameColumn("Value", new double?[] { 10, 20, 30, 40 }) + ), + new List { "id" }, + new List { "A", "B" }, + "Variable", + "Value", + true, + }; + yield return new object[] + { + new DataFrame( + new Int32DataFrameColumn("id", new int?[] { 1, 2 }), + new DoubleDataFrameColumn("A", new double?[] { 10, 20 }), + new DoubleDataFrameColumn("B", new double?[] { 30, 40 }) + ), + new DataFrame( + new Int32DataFrameColumn("id", new int?[] { 1, 2, 1, 2 }), + new StringDataFrameColumn("Variable", new string[] { "A", "A", "B", "B" }), + new DoubleDataFrameColumn("Value", new double?[] { 10, 20, 30, 40 }) + ), + new List { "id" }, + null, + "Variable", + "Value", + true, + }; + yield return new object[] + { + new DataFrame( + new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4 }), + new DoubleDataFrameColumn("A", new double?[] { 10, 20, null, 30 }), + new DoubleDataFrameColumn("B", new double?[] { 30, 40, 50, null }) + ), + new DataFrame( + new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4, 1, 2, 3, 4 }), + new StringDataFrameColumn("Variable", new string[] { "A", "A", "A", "A", "B", "B", "B", "B" }), + new DoubleDataFrameColumn("Value", new double?[] { 10, 20, null, 30, 30, 40, 50, null }) + ), + new List { "id" }, + null, + "Variable", + "Value", + false, + }; + yield return new object[] + { + new DataFrame( + new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4 }), + new DoubleDataFrameColumn("A", new double?[] { 10, 20, null, 30 }), + new DoubleDataFrameColumn("B", new double?[] { 30, 40, 50, null }) + ), + new DataFrame( + new Int32DataFrameColumn("id", new int?[] { 1, 2, 4, 1, 2, 3 }), + new StringDataFrameColumn("Variable", new string[] { "A", "A", "A", "B", "B", "B" }), + new DoubleDataFrameColumn("Value", new double?[] { 10, 20, 30, 30, 40, 50 }) + ), + new List { "id" }, + null, + "Variable", + "Value", + true, + }; + yield return new object[] + { + new DataFrame( + new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4 }), + new DoubleDataFrameColumn("A", new double?[] { 10, 20, null, 30 }), + new StringDataFrameColumn("B", new string[] { "30", "40", "50", null }) + ), + new DataFrame( + new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4, 1, 2, 3, 4 }), + new StringDataFrameColumn("Variable", new string[] { "A", "A", "A", "A", "B", "B", "B", "B" }), + new StringDataFrameColumn("Value", new string[] { "10", "20", null, "30", "30", "40", "50", null }) + ), + new List { "id" }, + null, + "Variable", + "Value", + false, + }; + } + + [Theory] + [MemberData(nameof(GenerateDataFrameMeltData))] + public void TestMelt(DataFrame inputDataFrame, DataFrame outputDataFrame, IEnumerable idColumns, IEnumerable valueColumns, string variableName, string valueName, bool dropNulls) + { + DataFrameAssert.Equal(outputDataFrame, inputDataFrame.Melt(idColumns, valueColumns, variableName, valueName, dropNulls)); + } + + [Fact] + public void TestMelt_InvalidData() + { + DataFrame df = new DataFrame( + new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4 }), + new DoubleDataFrameColumn("A", new double?[] { 10, 20, null, 30 }), + new DoubleDataFrameColumn("B", new double?[] { 30, 40, 50, null }) + ); + + Assert.Throws(() => df.Melt(new string[0], new string[] { "id", "A", "B" })); + + Assert.Throws(() => df.Melt(new string[] { "id", "A", "B" }, new string[0])); + + Assert.Throws(() => df.Melt(new string[] { "id", "A" }, new string[] { "A", "B" })); + + Assert.Throws(() => df.Melt(new string[] { "id", "A", "B" })); + } } } From 8508ba37a6a75acfa0bd0e8519634d6710dcec3e Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Sun, 8 Feb 2026 19:52:51 -0600 Subject: [PATCH 02/17] Add test for empty string --- .../DataFrameTests.cs | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index c9de1db581..80e95a1db8 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -1470,14 +1470,14 @@ public static IEnumerable GenerateDataFrameMeltData() yield return new object[] { new DataFrame( - new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4 }), - new DoubleDataFrameColumn("A", new double?[] { 10, 20, null, 30 }), - new StringDataFrameColumn("B", new string[] { "30", "40", "50", null }) + new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4, 5 }), + new DoubleDataFrameColumn("A", new double?[] { 10, 20, null, 30, 40 }), + new StringDataFrameColumn("B", new string[] { "30", "40", "50", null, "" }) ), new DataFrame( - new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4, 1, 2, 3, 4 }), - new StringDataFrameColumn("Variable", new string[] { "A", "A", "A", "A", "B", "B", "B", "B" }), - new StringDataFrameColumn("Value", new string[] { "10", "20", null, "30", "30", "40", "50", null }) + new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4, 5, 1, 2, 3, 4, 5 }), + new StringDataFrameColumn("Variable", new string[] { "A", "A", "A", "A", "A", "B", "B", "B", "B", "B" }), + new StringDataFrameColumn("Value", new string[] { "10", "20", null, "30", "40", "30", "40", "50", null, "" }) ), new List { "id" }, null, @@ -1485,6 +1485,24 @@ public static IEnumerable GenerateDataFrameMeltData() "Value", false, }; + yield return new object[] + { + new DataFrame( + new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4, 5 }), + new DoubleDataFrameColumn("A", new double?[] { 10, 20, null, 30, 40 }), + new StringDataFrameColumn("B", new string[] { "30", "40", "50", null, "" }) + ), + new DataFrame( + new Int32DataFrameColumn("id", new int?[] { 1, 2, 4, 5, 1, 2, 3 }), + new StringDataFrameColumn("Variable", new string[] { "A", "A", "A", "A", "B", "B", "B" }), + new StringDataFrameColumn("Value", new string[] { "10", "20", "30", "40", "30", "40", "50" }) + ), + new List { "id" }, + null, + "Variable", + "Value", + true, + }; } [Theory] From c8c39f6a711131a174add1d20f3654d21a7959e5 Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Mon, 9 Feb 2026 12:03:09 -0600 Subject: [PATCH 03/17] Remove unused import --- src/Microsoft.Data.Analysis/DataFrame.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.cs b/src/Microsoft.Data.Analysis/DataFrame.cs index 06c6538833..5c8e216299 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.cs @@ -3,7 +3,6 @@ // See the LICENSE file in the project root for more information. using System; -using System.Collections; using System.Collections.Generic; using System.Diagnostics; using System.Globalization; From 27f0e7df51f6546d554f0daabd79a328704a2955 Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Mon, 9 Feb 2026 12:06:33 -0600 Subject: [PATCH 04/17] Add tests for Melt on empty dataframes --- .../DataFrameTests.cs | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index 80e95a1db8..92af117526 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -1503,6 +1503,42 @@ public static IEnumerable GenerateDataFrameMeltData() "Value", true, }; + yield return new object[] + { + new DataFrame( + new Int32DataFrameColumn("id", new int?[0]), + new DoubleDataFrameColumn("A", new double?[0]), + new StringDataFrameColumn("B", new string[0]) + ), + new DataFrame( + new Int32DataFrameColumn("id", new int?[0]), + new StringDataFrameColumn("Variable", new string[0]), + new StringDataFrameColumn("Value", new string[0]) + ), + new List { "id" }, + null, + "Variable", + "Value", + false, + }; + yield return new object[] + { + new DataFrame( + new Int32DataFrameColumn("id", new int?[0]), + new DoubleDataFrameColumn("A", new double?[0]), + new StringDataFrameColumn("B", new string[0]) + ), + new DataFrame( + new Int32DataFrameColumn("id", new int?[0]), + new StringDataFrameColumn("Variable", new string[0]), + new StringDataFrameColumn("Value", new string[0]) + ), + new List { "id" }, + null, + "Variable", + "Value", + true, + }; } [Theory] From 946cde6f320f99228fb274fc06bc1fea23118c98 Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Mon, 9 Feb 2026 15:13:50 -0600 Subject: [PATCH 05/17] Treat different column types as the same as long as they have the same underlying data type. --- src/Microsoft.Data.Analysis/DataFrame.cs | 6 ++-- .../DataFrameTests.cs | 36 +++++++++++++++++++ 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.cs b/src/Microsoft.Data.Analysis/DataFrame.cs index 5c8e216299..d1f4919df8 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.cs @@ -685,8 +685,8 @@ public DataFrame Append(IEnumerable> row, bool inPl /// /// /// Name for the new column that will contain the values from the unpivoted columns. Defaults to "value". - /// If value columns have different types, this column will be of type string; otherwise, it will - /// match the type of the value columns. + /// If value columns contain different types, this column will be of type string; otherwise, it will + /// match the type of the first value column. /// /// /// If true, rows where the value is null or empty string will be excluded from the result. @@ -823,7 +823,7 @@ private List InitializeIdColumns(List idColumnList, lon private DataFrameColumn CreateValueColumn(List valueColumnList, string valueName, long size) { var valueTypes = valueColumnList - .Select(name => _columnCollection[name].GetType()) + .Select(name => _columnCollection[name].DataType) .Distinct() .Count(); diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index 92af117526..52753cb4c6 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -1539,6 +1539,42 @@ public static IEnumerable GenerateDataFrameMeltData() "Value", true, }; + yield return new object[] + { + new DataFrame( + new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4 }), + new DoubleDataFrameColumn("A", new double?[] { 10, 20, null, 30 }), + new PrimitiveDataFrameColumn("B", new double?[] { 30, 40, 50, null }) + ), + new DataFrame( + new Int32DataFrameColumn("id", new int?[] { 1, 2, 4, 1, 2, 3 }), + new StringDataFrameColumn("Variable", new string[] { "A", "A", "A", "B", "B", "B" }), + new DoubleDataFrameColumn("Value", new double?[] { 10, 20, 30, 30, 40, 50 }) + ), + new List { "id" }, + null, + "Variable", + "Value", + true, + }; + yield return new object[] + { + new DataFrame( + new Int32DataFrameColumn("id", new int?[] { 1, 2, 3, 4 }), + new PrimitiveDataFrameColumn("A", new double?[] { 10, 20, null, 30 }), + new DoubleDataFrameColumn("B", new double?[] { 30, 40, 50, null }) + ), + new DataFrame( + new Int32DataFrameColumn("id", new int?[] { 1, 2, 4, 1, 2, 3 }), + new StringDataFrameColumn("Variable", new string[] { "A", "A", "A", "B", "B", "B" }), + new PrimitiveDataFrameColumn("Value", new double?[] { 10, 20, 30, 30, 40, 50 }) + ), + new List { "id" }, + null, + "Variable", + "Value", + true, + }; } [Theory] From dd3ac1f44d2830d58f0e4eb94bb6e53f113fb267 Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Mon, 9 Feb 2026 15:15:55 -0600 Subject: [PATCH 06/17] Specify paramName in ArgumentException --- src/Microsoft.Data.Analysis/DataFrame.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.cs b/src/Microsoft.Data.Analysis/DataFrame.cs index d1f4919df8..00301eee32 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.cs @@ -770,7 +770,7 @@ private void ValidateMeltParameters(List idColumnList, List valu if (valueColumns != null && valueColumnList.Any(v => idColumnList.Contains(v))) { - throw new ArgumentException("Columns cannot exist in both idColumns and valueColumns"); + throw new ArgumentException("Columns cannot exist in both idColumns and valueColumns", nameof(valueColumns)); } if (valueColumns == null && valueColumnList.Count == 0) From 92a445041e84e8212a64c86ce95087e983ab79aa Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Mon, 9 Feb 2026 15:18:55 -0600 Subject: [PATCH 07/17] Use nameof() in ArgumentExceptions --- src/Microsoft.Data.Analysis/DataFrame.cs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.cs b/src/Microsoft.Data.Analysis/DataFrame.cs index 00301eee32..c315997c57 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.cs @@ -740,7 +740,7 @@ public DataFrame Melt(IEnumerable idColumns, IEnumerable valueCo .Select(c => c.Name) .ToList(); - ValidateMeltParameters(idColumnList, valueColumnList, valueColumns); + ValidateMeltParameters(idColumnList, valueColumnList, valueColumns, nameof(idColumns), nameof(valueColumns)); long totalOutputRows = CalculateTotalOutputRows(valueColumnList, dropNulls); @@ -756,21 +756,21 @@ public DataFrame Melt(IEnumerable idColumns, IEnumerable valueCo return new DataFrame(outputCols); } - private void ValidateMeltParameters(List idColumnList, List valueColumnList, IEnumerable valueColumns) + private void ValidateMeltParameters(List idColumnList, List valueColumnList, IEnumerable valueColumns, string idColumnsName, string valueColumnsName) { if (idColumnList.Count == 0) { - throw new ArgumentException("Must provide at least 1 ID column", "idColumns"); + throw new ArgumentException("Must provide at least 1 ID column", idColumnsName); } if (valueColumns != null && valueColumnList.Count == 0) { - throw new ArgumentException("Must provide at least 1 value column when specifying value columns manually", nameof(valueColumns)); + throw new ArgumentException("Must provide at least 1 value column when specifying value columns manually", valueColumnsName); } if (valueColumns != null && valueColumnList.Any(v => idColumnList.Contains(v))) { - throw new ArgumentException("Columns cannot exist in both idColumns and valueColumns", nameof(valueColumns)); + throw new ArgumentException("Columns cannot exist in both idColumns and valueColumns", valueColumnsName); } if (valueColumns == null && valueColumnList.Count == 0) From eac1478dc7bd0f5918611a6051303cca8ee198d2 Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Mon, 9 Feb 2026 15:21:54 -0600 Subject: [PATCH 08/17] Moved validation out of separate method to allow for nameof() use. --- src/Microsoft.Data.Analysis/DataFrame.cs | 38 +++++++++++------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.cs b/src/Microsoft.Data.Analysis/DataFrame.cs index c315997c57..1c920ec267 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.cs @@ -733,6 +733,7 @@ public DataFrame Append(IEnumerable> row, bool inPl /// public DataFrame Melt(IEnumerable idColumns, IEnumerable valueColumns = null, string variableName = "variable", string valueName = "value", bool dropNulls = false) { + var idColumnList = idColumns.ToList(); var valueColumnList = valueColumns?.ToList() ?? _columnCollection @@ -740,43 +741,38 @@ public DataFrame Melt(IEnumerable idColumns, IEnumerable valueCo .Select(c => c.Name) .ToList(); - ValidateMeltParameters(idColumnList, valueColumnList, valueColumns, nameof(idColumns), nameof(valueColumns)); - - long totalOutputRows = CalculateTotalOutputRows(valueColumnList, dropNulls); - - var outputCols = InitializeIdColumns(idColumnList, totalOutputRows); - var variableColumn = new StringDataFrameColumn(variableName, totalOutputRows); - var valueColumn = CreateValueColumn(valueColumnList, valueName, totalOutputRows); - - FillMeltedData(idColumnList, valueColumnList, outputCols, variableColumn, valueColumn, dropNulls); - - outputCols.Add(variableColumn); - outputCols.Add(valueColumn); - - return new DataFrame(outputCols); - } - - private void ValidateMeltParameters(List idColumnList, List valueColumnList, IEnumerable valueColumns, string idColumnsName, string valueColumnsName) - { if (idColumnList.Count == 0) { - throw new ArgumentException("Must provide at least 1 ID column", idColumnsName); + throw new ArgumentException("Must provide at least 1 ID column", nameof(idColumns)); } if (valueColumns != null && valueColumnList.Count == 0) { - throw new ArgumentException("Must provide at least 1 value column when specifying value columns manually", valueColumnsName); + throw new ArgumentException("Must provide at least 1 value column when specifying value columns manually", nameof(valueColumns)); } if (valueColumns != null && valueColumnList.Any(v => idColumnList.Contains(v))) { - throw new ArgumentException("Columns cannot exist in both idColumns and valueColumns", valueColumnsName); + throw new ArgumentException("Columns cannot exist in both idColumns and valueColumns", nameof(valueColumns)); } if (valueColumns == null && valueColumnList.Count == 0) { throw new InvalidOperationException("There are no columns in the DataFrame to use as value columns after excluding the ID columns"); } + + long totalOutputRows = CalculateTotalOutputRows(valueColumnList, dropNulls); + + var outputCols = InitializeIdColumns(idColumnList, totalOutputRows); + var variableColumn = new StringDataFrameColumn(variableName, totalOutputRows); + var valueColumn = CreateValueColumn(valueColumnList, valueName, totalOutputRows); + + FillMeltedData(idColumnList, valueColumnList, outputCols, variableColumn, valueColumn, dropNulls); + + outputCols.Add(variableColumn); + outputCols.Add(valueColumn); + + return new DataFrame(outputCols); } private long CalculateTotalOutputRows(List valueColumnList, bool dropNulls) From fcfa2dc6062630080bfb9fa65910cae84102e291 Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Mon, 9 Feb 2026 15:22:55 -0600 Subject: [PATCH 09/17] Guard against possible null idColumns parameter --- src/Microsoft.Data.Analysis/DataFrame.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.cs b/src/Microsoft.Data.Analysis/DataFrame.cs index 1c920ec267..366940244e 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.cs @@ -734,7 +734,7 @@ public DataFrame Append(IEnumerable> row, bool inPl public DataFrame Melt(IEnumerable idColumns, IEnumerable valueColumns = null, string variableName = "variable", string valueName = "value", bool dropNulls = false) { - var idColumnList = idColumns.ToList(); + var idColumnList = idColumns?.ToList() ?? new List(); var valueColumnList = valueColumns?.ToList() ?? _columnCollection .Where(c => !idColumnList.Contains(c.Name)) From 1b12cf885715ec19861580d790865085fd088fed Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Mon, 9 Feb 2026 15:25:36 -0600 Subject: [PATCH 10/17] Do not allow missing variableName or valueName parameters --- src/Microsoft.Data.Analysis/DataFrame.cs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/Microsoft.Data.Analysis/DataFrame.cs b/src/Microsoft.Data.Analysis/DataFrame.cs index 366940244e..50d51794c8 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.cs @@ -733,6 +733,15 @@ public DataFrame Append(IEnumerable> row, bool inPl /// public DataFrame Melt(IEnumerable idColumns, IEnumerable valueColumns = null, string variableName = "variable", string valueName = "value", bool dropNulls = false) { + if (string.IsNullOrWhiteSpace(variableName)) + { + throw new ArgumentException("Parameter must not be null, empty, or whitespace", nameof(variableName)); + } + + if (string.IsNullOrWhiteSpace(valueName)) + { + throw new ArgumentException("Parameter must not be null, empty, or whitespace", nameof(valueName)); + } var idColumnList = idColumns?.ToList() ?? new List(); var valueColumnList = valueColumns?.ToList() From 1814aa554fecda8a60a2257fa8aa26802f290296 Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Mon, 9 Feb 2026 15:33:01 -0600 Subject: [PATCH 11/17] Add validation for column names that match existing column names --- src/Microsoft.Data.Analysis/DataFrame.cs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/Microsoft.Data.Analysis/DataFrame.cs b/src/Microsoft.Data.Analysis/DataFrame.cs index 50d51794c8..cad923c1ac 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.cs @@ -770,6 +770,18 @@ public DataFrame Melt(IEnumerable idColumns, IEnumerable valueCo throw new InvalidOperationException("There are no columns in the DataFrame to use as value columns after excluding the ID columns"); } + IEnumerable existingColumnNames = _columnCollection.Select(c => c.Name); + + if (existingColumnNames.Contains(variableName)) + { + throw new ArgumentException($"Variable name '{variableName}' matches an existing column name", nameof(variableName)); + } + + if (existingColumnNames.Contains(valueName)) + { + throw new ArgumentException($"Value name '{valueName}' matches an existing column name", nameof(valueName)); + } + long totalOutputRows = CalculateTotalOutputRows(valueColumnList, dropNulls); var outputCols = InitializeIdColumns(idColumnList, totalOutputRows); From 093fc4d17c87dbf257027309e796f23e64e85787 Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Mon, 9 Feb 2026 15:33:04 -0600 Subject: [PATCH 12/17] Add more tests for invalid data --- .../DataFrameTests.cs | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index 52753cb4c6..0448cb87dc 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -1593,12 +1593,32 @@ public void TestMelt_InvalidData() new DoubleDataFrameColumn("B", new double?[] { 30, 40, 50, null }) ); + // No id columns Assert.Throws(() => df.Melt(new string[0], new string[] { "id", "A", "B" })); + // No value columns Assert.Throws(() => df.Melt(new string[] { "id", "A", "B" }, new string[0])); + // Id column is also value column Assert.Throws(() => df.Melt(new string[] { "id", "A" }, new string[] { "A", "B" })); + // Value name is null, empty, or whitespace + Assert.Throws(() => df.Melt(new string[] { "id", "A" }, new string[] { "B" }, valueName: null)); + Assert.Throws(() => df.Melt(new string[] { "id", "A" }, new string[] { "B" }, valueName: "")); + Assert.Throws(() => df.Melt(new string[] { "id", "A" }, new string[] { "B" }, valueName: " \r\n\t")); + + // Variable name is null, empty, or whitespace + Assert.Throws(() => df.Melt(new string[] { "id", "A" }, new string[] { "B" }, variableName: null)); + Assert.Throws(() => df.Melt(new string[] { "id", "A" }, new string[] { "B" }, variableName: "")); + Assert.Throws(() => df.Melt(new string[] { "id", "A" }, new string[] { "B" }, variableName: " \r\n\t")); + + // Value name matches an existing column name in the DataFrame + Assert.Throws(() => df.Melt(new string[] { "id", "A" }, new string[] { "B" }, valueName: "B")); + + // Variable name matches an existing column name in the DataFrame + Assert.Throws(() => df.Melt(new string[] { "id", "A" }, new string[] { "B" }, variableName: "B")); + + // There are no columns in the DataFrame to use as value columns after excluding the ID columns Assert.Throws(() => df.Melt(new string[] { "id", "A", "B" })); } } From 35dab600f61085502088ff13dc0f806b479e7d61 Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Mon, 9 Feb 2026 15:41:47 -0600 Subject: [PATCH 13/17] Use HashSet in case the DataFrame has many columns --- src/Microsoft.Data.Analysis/DataFrame.cs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.cs b/src/Microsoft.Data.Analysis/DataFrame.cs index cad923c1ac..d93d30f0dd 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.cs @@ -744,9 +744,17 @@ public DataFrame Melt(IEnumerable idColumns, IEnumerable valueCo } var idColumnList = idColumns?.ToList() ?? new List(); + + HashSet idColumnSet = null; + + if (valueColumns is null) + { + idColumnSet = [.. idColumnList]; + } + var valueColumnList = valueColumns?.ToList() ?? _columnCollection - .Where(c => !idColumnList.Contains(c.Name)) + .Where(c => !idColumnSet.Contains(c.Name)) .Select(c => c.Name) .ToList(); From ad7257e84b1bbfb7d4d7d0ce1209516648cb5383 Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Mon, 9 Feb 2026 15:43:31 -0600 Subject: [PATCH 14/17] Add tests for null idColumns and valueColumns parameters --- test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index 0448cb87dc..bceef6c2d5 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -1594,9 +1594,11 @@ public void TestMelt_InvalidData() ); // No id columns + Assert.Throws(() => df.Melt(null, new string[] { "id", "A", "B" })); Assert.Throws(() => df.Melt(new string[0], new string[] { "id", "A", "B" })); // No value columns + Assert.Throws(() => df.Melt(new string[] { "id", "A", "B" }, null)); Assert.Throws(() => df.Melt(new string[] { "id", "A", "B" }, new string[0])); // Id column is also value column From ddd5daa28c0fad48212b3d11b3a75617e2f1c8bb Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Mon, 9 Feb 2026 15:50:47 -0600 Subject: [PATCH 15/17] Remove bad test --- test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index bceef6c2d5..7534a68dac 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -1598,7 +1598,6 @@ public void TestMelt_InvalidData() Assert.Throws(() => df.Melt(new string[0], new string[] { "id", "A", "B" })); // No value columns - Assert.Throws(() => df.Melt(new string[] { "id", "A", "B" }, null)); Assert.Throws(() => df.Melt(new string[] { "id", "A", "B" }, new string[0])); // Id column is also value column From a1995762cb50fb9f74866240540be0f52714baeb Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Mon, 9 Feb 2026 15:51:05 -0600 Subject: [PATCH 16/17] Test default parameter values --- .../DataFrameTests.cs | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index 7534a68dac..211f4a06b9 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -1566,13 +1566,13 @@ public static IEnumerable GenerateDataFrameMeltData() ), new DataFrame( new Int32DataFrameColumn("id", new int?[] { 1, 2, 4, 1, 2, 3 }), - new StringDataFrameColumn("Variable", new string[] { "A", "A", "A", "B", "B", "B" }), - new PrimitiveDataFrameColumn("Value", new double?[] { 10, 20, 30, 30, 40, 50 }) + new StringDataFrameColumn("TestVar", new string[] { "A", "A", "A", "B", "B", "B" }), + new PrimitiveDataFrameColumn("TestVal", new double?[] { 10, 20, 30, 30, 40, 50 }) ), new List { "id" }, null, - "Variable", - "Value", + "TestVar", + "TestVal", true, }; } @@ -1621,6 +1621,12 @@ public void TestMelt_InvalidData() // There are no columns in the DataFrame to use as value columns after excluding the ID columns Assert.Throws(() => df.Melt(new string[] { "id", "A", "B" })); + + // Test default values for variableName, valueName, and dropNulls parameters + DataFrame melted = df.Melt(new string[] { "id" }, new string[] { "A" }); + Assert.True(melted.Columns.IndexOf("variable") >= 0); + Assert.True(melted.Columns.IndexOf("value") >= 0); + Assert.Equal(4, melted.Rows.Count); } } } From a3c50028b045cd6ceb8c7fcceb48f4f2f9b09064 Mon Sep 17 00:00:00 2001 From: Joshua Zierhut Date: Mon, 9 Feb 2026 15:58:37 -0600 Subject: [PATCH 17/17] Cache idColumns to avoid repeated lookups --- src/Microsoft.Data.Analysis/DataFrame.cs | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.cs b/src/Microsoft.Data.Analysis/DataFrame.cs index d93d30f0dd..bee475b108 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.cs @@ -873,12 +873,20 @@ private void FillMeltedData(List idColumnList, List valueColumnL { bool mixedTypes = valueColumn is StringDataFrameColumn; long currentRow = 0; + long rowCount = _rowCollection.Count; + int idColumnCount = idColumnList.Count; + + var idColumns = new DataFrameColumn[idColumnCount]; + for (int i = 0; i < idColumnCount; i++) + { + idColumns[i] = _columnCollection[idColumnList[i]]; + } foreach (var valueColumnName in valueColumnList) { var sourceValueColumn = _columnCollection[valueColumnName]; - for (long sourceRow = 0; sourceRow < _rowCollection.Count; sourceRow++) + for (long sourceRow = 0; sourceRow < rowCount; sourceRow++) { var value = sourceValueColumn[sourceRow]; @@ -887,14 +895,13 @@ private void FillMeltedData(List idColumnList, List valueColumnL continue; } - for (int i = 0; i < idColumnList.Count; i++) + for (int i = 0; i < idColumnCount; i++) { - outputIdCols[i][currentRow] = _columnCollection[idColumnList[i]][sourceRow]; + outputIdCols[i][currentRow] = idColumns[i][sourceRow]; } variableColumn[currentRow] = valueColumnName; valueColumn[currentRow] = mixedTypes ? value?.ToString() : value; - currentRow++; } }