Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
81763c4
Add Melt method to DataFrame
sevenzees Feb 8, 2026
8508ba3
Add test for empty string
sevenzees Feb 9, 2026
c8c39f6
Remove unused import
sevenzees Feb 9, 2026
27f0e7d
Add tests for Melt on empty dataframes
sevenzees Feb 9, 2026
946cde6
Treat different column types as the same as long as they have the sam…
sevenzees Feb 9, 2026
dd3ac1f
Specify paramName in ArgumentException
sevenzees Feb 9, 2026
92a4450
Use nameof() in ArgumentExceptions
sevenzees Feb 9, 2026
eac1478
Moved validation out of separate method to allow for nameof() use.
sevenzees Feb 9, 2026
fcfa2dc
Guard against possible null idColumns parameter
sevenzees Feb 9, 2026
1b12cf8
Do not allow missing variableName or valueName parameters
sevenzees Feb 9, 2026
1814aa5
Add validation for column names that match existing column names
sevenzees Feb 9, 2026
093fc4d
Add more tests for invalid data
sevenzees Feb 9, 2026
35dab60
Use HashSet in case the DataFrame has many columns
sevenzees Feb 9, 2026
ad7257e
Add tests for null idColumns and valueColumns parameters
sevenzees Feb 9, 2026
ddd5daa
Remove bad test
sevenzees Feb 9, 2026
a199576
Test default parameter values
sevenzees Feb 9, 2026
a3c5002
Cache idColumns to avoid repeated lookups
sevenzees Feb 9, 2026
40cdf76
Merge branch 'dotnet:main' into main
sevenzees Mar 19, 2026
60daf6c
Refactor Melt exceptions to use string resources
sevenzees Mar 19, 2026
118493d
Use ArgumentNullException for null idColumns parameter
sevenzees Mar 19, 2026
c206035
Optimize column name check
sevenzees Mar 19, 2026
52efdae
Require variableName and valueName to be different
sevenzees Mar 19, 2026
91b1f1f
Test for ID and value columns that do not exist
sevenzees Mar 19, 2026
88ca1e4
Do not drop empty strings with dropNulls = true
sevenzees Mar 19, 2026
d29129e
Rename mixed types to convertToString
sevenzees Mar 19, 2026
734f348
Add remark to XML doc for melt
sevenzees Mar 19, 2026
a26437d
Add test case with multiple id columns
sevenzees Mar 19, 2026
08779a2
Remove unused import
sevenzees Mar 20, 2026
e400422
Remove dead code in Melt
sevenzees Mar 20, 2026
46e6924
Fix example output for Melt
sevenzees Mar 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
269 changes: 269 additions & 0 deletions src/Microsoft.Data.Analysis/DataFrame.cs
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,275 @@ public DataFrame Append(IEnumerable<KeyValuePair<string, object>> row, bool inPl
return ret;
}

/// <summary>
/// Transforms the DataFrame from wide format to long format by unpivoting specified columns.
/// This operation takes multiple value columns and "melts" them into two columns: one containing
/// the original column names (variable) and one containing the values.
/// </summary>
/// <param name="idColumns">
/// Column names to use as identifier variables. These columns will be repeated in the output
/// for each value column. Must contain at least one column name.
/// </param>
/// <param name="valueColumns">
/// Column names to unpivot into the variable and value columns. If null, all columns not
/// specified in <paramref name="idColumns"/> will be used as value columns.
/// </param>
/// <param name="variableName">
/// Name for the new column that will contain the original value column names. Defaults to "variable".
/// </param>
/// <param name="valueName">
/// Name for the new column that will contain the values from the unpivoted columns. Defaults to "value".
/// If value columns contain different types, this column will be of type string; otherwise, it will
/// match the type of the first value column.
/// </param>
/// <param name="dropNulls">
/// If true, rows where the value is null will be excluded from the result.
/// Defaults to false.
/// </param>
/// <returns>
/// A new DataFrame in long format with columns for each ID column, plus the variable and value columns.
/// The number of rows will be approximately (number of original rows × number of value columns),
/// or fewer if <paramref name="dropNulls"/> is true.
/// </returns>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="idColumns"/> is empty, when <paramref name="valueColumns"/> is specified
/// but empty, or when any column appears in both <paramref name="idColumns"/> and <paramref name="valueColumns"/>.
/// </exception>
/// <exception cref="InvalidOperationException">
/// Thrown when <paramref name="valueColumns"/> is null and there are no columns available to use as
/// value columns after excluding the ID columns.
/// </exception>
/// <example>
/// <code>
/// // Original DataFrame:
/// // | ID | Name | 2020 | 2021 | 2022 |
/// // |----|-------|------|------|------|
/// // | 1 | Alice | 100 | 110 | 120 |
/// // | 2 | Bob | 200 | 210 | 220 |
///
/// var melted = df.Melt(
/// idColumns: new[] { "ID", "Name" },
/// valueColumns: new[] { "2020", "2021", "2022" },
/// variableName: "Year",
/// valueName: "Sales"
/// );
///
/// // Result:
/// // | ID | Name | Year | Sales |
/// // |----|-------|------|-------|
/// // | 1 | Alice | 2020 | 100 |
/// // | 2 | Bob | 2020 | 200 |
/// // | 1 | Alice | 2021 | 110 |
/// // | 2 | Bob | 2021 | 210 |
/// // | 1 | Alice | 2022 | 120 |
/// // | 2 | Bob | 2022 | 220 |
/// </code>
/// </example>
/// <remarks>
/// Note: The output rows are ordered by value column (all rows for the first value column,
/// then all rows for the second, etc.), which differs from pandas.melt() which orders by
/// source row.
/// </remarks>
public DataFrame Melt(IEnumerable<string> idColumns, IEnumerable<string> valueColumns = null, string variableName = "variable", string valueName = "value", bool dropNulls = false)
{
if (string.IsNullOrWhiteSpace(variableName))
{
throw new ArgumentException(Strings.ParameterMustNotBeNullOrWhitespace, nameof(variableName));
}

if (string.IsNullOrWhiteSpace(valueName))
{
throw new ArgumentException(Strings.ParameterMustNotBeNullOrWhitespace, nameof(valueName));
}

if (idColumns == null)
{
throw new ArgumentNullException(nameof(idColumns));
}

var idColumnList = idColumns.ToList();

HashSet<string> idColumnSet = null;

if (valueColumns is null)
{
idColumnSet = [.. idColumnList];
}

var valueColumnList = valueColumns?.ToList()
?? _columnCollection
.Where(c => !idColumnSet.Contains(c.Name))
.Select(c => c.Name)
.ToList();

if (idColumnList.Count == 0)
{
throw new ArgumentException(Strings.MissingIdColumns, nameof(idColumns));
}

if (valueColumns != null && valueColumnList.Count == 0)
{
throw new ArgumentException(Strings.MissingValueColumns, nameof(valueColumns));
}

if (valueColumns != null && valueColumnList.Any(v => idColumnList.Contains(v)))
{
throw new ArgumentException(Strings.DuplicateColumnsInIdAndValueLists, nameof(valueColumns));
}

if (valueColumns == null && valueColumnList.Count == 0)
{
throw new InvalidOperationException(Strings.NoValueColumnsRemaining);
}

if (_columnCollection.IndexOf(variableName) >= 0)
{
throw new ArgumentException(string.Format(Strings.VariableNameAlreadyExists, variableName), nameof(variableName));
}

if (_columnCollection.IndexOf(valueName) >= 0)
{
throw new ArgumentException(string.Format(Strings.ValueNameAlreadyExists, valueName), nameof(valueName));
}

if (string.Equals(variableName, valueName))
{
throw new ArgumentException(string.Format(Strings.VariableNameAndValueNameMustBeDifferent, nameof(variableName), nameof(valueName)), nameof(valueName));
}

foreach (var columnName in idColumnList)
{
if (_columnCollection.IndexOf(columnName) < 0)
{
throw new ArgumentException(string.Format(Strings.InvalidColumnName, columnName), nameof(idColumns));
}
}

foreach (var columnName in valueColumnList)
{
if (_columnCollection.IndexOf(columnName) < 0)
{
throw new ArgumentException(string.Format(Strings.InvalidColumnName, columnName), nameof(valueColumns));
}
}

long totalOutputRows = CalculateTotalOutputRows(valueColumnList, dropNulls);

var outputCols = InitializeIdColumns(idColumnList, totalOutputRows);
var variableColumn = new StringDataFrameColumn(variableName, totalOutputRows);
var valueColumn = CreateValueColumn(valueColumnList, valueName, totalOutputRows);

FillMeltedData(idColumnList, valueColumnList, outputCols, variableColumn, valueColumn, dropNulls);

outputCols.Add(variableColumn);
outputCols.Add(valueColumn);

return new DataFrame(outputCols);
}

private long CalculateTotalOutputRows(List<string> valueColumnList, bool dropNulls)
{
if (!dropNulls)
{
return _rowCollection.Count * valueColumnList.Count;
}

long total = 0;

foreach (var columnName in valueColumnList)
{
var column = _columnCollection[columnName];

foreach (var item in column)
{
if (item != null)
{
total++;
}
}
}

return total;
}

private List<DataFrameColumn> InitializeIdColumns(List<string> idColumnList, long size)
{
PrimitiveDataFrameColumn<long> empty = new PrimitiveDataFrameColumn<long>("Empty");
var outputCols = new List<DataFrameColumn>(idColumnList.Count);

foreach (var idColumnName in idColumnList)
{
var sourceColumn = _columnCollection[idColumnName];
var newColumn = sourceColumn.Clone(empty);
newColumn.Resize(size);
outputCols.Add(newColumn);
}

return outputCols;
}

private DataFrameColumn CreateValueColumn(List<string> valueColumnList, string valueName, long size)
{
var valueTypes = valueColumnList
.Select(name => _columnCollection[name].DataType)
.Distinct()
.Count();

DataFrameColumn valueColumn;

if (valueTypes > 1)
{
valueColumn = new StringDataFrameColumn(valueName, size);
}
else
{
PrimitiveDataFrameColumn<long> empty = new PrimitiveDataFrameColumn<long>("Empty");
valueColumn = _columnCollection[valueColumnList[0]].Clone(empty);
valueColumn.SetName(valueName);
valueColumn.Resize(size);
}

return valueColumn;
}

private void FillMeltedData(List<string> idColumnList, List<string> valueColumnList, List<DataFrameColumn> outputIdCols, StringDataFrameColumn variableColumn, DataFrameColumn valueColumn, bool dropNulls)
{
bool convertToString = valueColumn is StringDataFrameColumn;
long currentRow = 0;
long rowCount = _rowCollection.Count;
int idColumnCount = idColumnList.Count;

var idColumns = new DataFrameColumn[idColumnCount];
for (int i = 0; i < idColumnCount; i++)
{
idColumns[i] = _columnCollection[idColumnList[i]];
}

foreach (var valueColumnName in valueColumnList)
{
var sourceValueColumn = _columnCollection[valueColumnName];

for (long sourceRow = 0; sourceRow < rowCount; sourceRow++)
{
var value = sourceValueColumn[sourceRow];

if (dropNulls && (value == null))
{
continue;
}

for (int i = 0; i < idColumnCount; i++)
{
outputIdCols[i][currentRow] = idColumns[i][sourceRow];
}

variableColumn[currentRow] = valueColumnName;
valueColumn[currentRow] = convertToString ? value?.ToString() : value;
currentRow++;
}
}
}

/// <summary>
/// Invalidates any cached data after a column has changed.
/// </summary>
Expand Down
74 changes: 73 additions & 1 deletion src/Microsoft.Data.Analysis/Strings.Designer.cs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading