Skip to content

Commit

Permalink
Fix: Columnar handling of duplicate column values (#14)
Browse files Browse the repository at this point in the history
  • Loading branch information
thekaveman authored Jan 11, 2022
2 parents 651dafd + 0d31af1 commit a27e486
Show file tree
Hide file tree
Showing 2 changed files with 308 additions and 118 deletions.
241 changes: 162 additions & 79 deletions src/Data/Csv/Columnar.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,131 +7,155 @@

namespace HashFields.Data.Csv
{
/// <summary>
/// Helper to work with delimited tabular data as columns rather than rows.
/// </summary>
/// <see cref="IEquatable{T}" />
internal class Columnar : IEquatable<Columnar>
{
// Ordered column names.
private readonly List<string> _headers = new();
// Ordered column names.
private readonly List<string> _header = new();
// Column values keyed by column name.
private readonly Dictionary<string, List<string>> _data = new();
// Separator placed between fields; assigned by the stream constructor.
private readonly string _delimiter;

/// <summary>
/// The column of values by column name.
/// </summary>
/// <remarks>
/// The returned list is the stored column itself, not a copy.
/// </remarks>
/// <param name="key">The name of the column.</param>
/// <returns>A list representing the column's values.</returns>
public List<string> this[string key] { get => _data[key]; }
/// <summary>
/// The column of values by column index.
/// </summary>
/// <param name="index">The 0-based index of the column.</param>
/// <returns>A list representing the column's values.</returns>
public List<string> this[int index] { get => _data[_headers[index]]; }
/// <summary>
/// The list of column names, returned as a new list on every access.
/// </summary>
public List<string> Header { get => _headers.ToList(); }
/// <summary>
/// The list of data columns. The outer list is new on every access;
/// the inner lists are the stored columns themselves.
/// </summary>
public List<List<string>> Columns { get => _data.Values.ToList(); }

/// <summary>
/// Initialize a new <c>Columnar</c> by delegating to the stream constructor
/// with a new, empty <c>MemoryStream</c>.
/// </summary>
/// <param name="delimiter">The delimiter used between fields in the data.</param>
public Columnar(string delimiter) : this(new MemoryStream(), delimiter)
{
}
/// <summary>
/// The column of values by column index.
/// </summary>
/// <remarks>
/// The index is resolved to a column name through the header, and the
/// returned list is the stored column itself, not a copy.
/// </remarks>
/// <param name="index">The 0-based index of the column.</param>
/// <returns>A list representing the column's values.</returns>
public List<string> this[int index] { get => _data[_header[index]]; }

/// <summary>
/// The list of column names, returned as a new list on every access.
/// </summary>
public List<string> Header { get => _header.ToList(); }

/// <summary>
/// The list of data columns. The outer list is new on every access;
/// the inner lists are the stored columns themselves.
/// </summary>
public List<List<string>> Columns { get => _data.Values.ToList(); }

/// <summary>
/// Initialize a new <c>Columnar</c> for delimited data.
/// </summary>
/// <remarks>
/// When <paramref name="stream"/> is <c>null</c> nothing is parsed or assigned:
/// the header and data keep their empty initial values and the delimiter
/// is left unset.
/// </remarks>
/// <param name="stream">The <c>Stream</c> of data to read into this <c>Columnar</c>.</param>
/// <param name="delimiter">The delimiter used between fields in the data.</param>
public Columnar(Stream stream, string delimiter)
{
if (stream is not null)
{
var tuple = Parse(stream, delimiter);
// remember the delimiter so it is available after construction
_delimiter = delimiter;

var tuple = Parse(stream, _delimiter);

// Item1 is the ordered header row, Item2 the columns keyed by name
_headers = tuple.Item1;
_header = tuple.Item1;
_data = tuple.Item2;
}
}

/// <summary>
/// Call a function for each value in the specified columns.
/// </summary>
/// <remarks>
/// Names in <paramref name="columns"/> that are not in the header are skipped.
/// Each processed column is stored as a new list built by <c>ConvertAll</c>,
/// so a list reference obtained before this call does not see the new values.
/// </remarks>
/// <param name="func">
/// A function taking a string as input and returning a string.
/// Each value in the column is passed through this function and
/// the results replace the column's stored values.
/// </param>
/// <param name="columns">The list of columns to apply the function on.</param>
public void Apply(Func<string, string> func, params string[] columns)
{
foreach (var column in _headers.Intersect(columns).ToArray())
// only act on names that really exist in the header
foreach (var column in _header.Intersect(columns).ToArray())
{
_data[column] = _data[column].ConvertAll(s => func(s));
}
}

/// <summary>
/// Determine whether another <c>Columnar</c> has the same ordered header
/// and the same values in every column.
/// </summary>
/// <param name="other">The instance to compare against.</param>
/// <returns><c>true</c> when header and column values match; otherwise <c>false</c>.</returns>
public bool Equals(Columnar other)
{
if (other is null)
{
return false;
}

// column names must match, in order
if (!_headers.SequenceEqual(other._headers))
{
return false;
}

// every column must hold the same values in the same order
foreach (var column in _data)
{
if (!column.Value.SequenceEqual(other._data[column.Key]))
{
return false;
}
}

return true;
}

/// <summary>
/// Determine whether an arbitrary object is a <c>Columnar</c> equal to this one.
/// </summary>
/// <param name="obj">The object to compare against.</param>
/// <returns>
/// <c>false</c> for <c>null</c> or a non-<c>Columnar</c> object; otherwise the
/// result of the typed <c>Equals</c> comparison.
/// </returns>
public override bool Equals(object obj)
{
if (obj is null)
{
return false;
}

if (obj is not Columnar columnar)
{
return false;
}

return Equals(columnar);
}

/// <summary>
/// Compute a hash code from the column names followed by every column value.
/// </summary>
/// <returns>The combined hash code.</returns>
public override int GetHashCode()
{
var hashcode = new HashCode();
// column names first
foreach (var header in _headers)
{
hashcode.Add(header);
}
// then each value of each column
foreach (var column in _data.Values)
{
foreach (var val in column)
{
hashcode.Add(val);
}
}
return hashcode.ToHashCode();
}

/// <summary>
/// Remove the named columns from this <c>Columnar</c> data.
/// The column names should match those found in the <c>Header</c>;
/// names that are not in the header are ignored.
/// </summary>
/// <seealso cref="Header" />
/// <param name="columns">The list of column names to remove.</param>
public void Remove(params string[] columns)
{
foreach (var column in _headers.Intersect(columns).ToArray())
// find intersection of the real header names and those for removal
// create a new array from this intersection so we don't loop over
// the collection we are modifying!
foreach (var column in _header.Intersect(columns).ToArray())
{
// drop the name and its values together so header and data stay in step
_headers.Remove(column);
_header.Remove(column);
_data.Remove(column);
}
}

/// <summary>
/// Write the rows of this <c>Columnar</c> to a stream, one line per row,
/// with the fields of each row joined by a comma.
/// </summary>
/// <param name="destination">A writable <c>Stream</c> target for this <c>Columnar</c>.</param>
public void Write(Stream destination)
{
// the writer is disposed when the method returns; a StreamWriter created
// this way also closes the stream it wraps
using var sw = new StreamWriter(destination);
foreach (var row in Rows())
{
sw.WriteLine(String.Join(",", row));
}
}

private List<List<string>> Rows()
/// <summary>
/// Compute the list of data rows from the current state of this <c>Columnar</c>.
/// </summary>
/// <remarks>
/// The first returned row is the header. The number of data rows equals the
/// length of the longest column, so <c>Max</c> throws when there are no columns.
/// A column shorter than the longest one contributes no value to the rows past
/// its end, so those rows hold fewer fields.
/// </remarks>
/// <returns>The header row followed by one list of values per data row.</returns>
public List<List<string>> Rows()
{
// find the column with the longest length (N) - the number of rows
// create a list of N lists to represent the rows
var rows = Enumerable.Range(0, Columns.Max(c => c.Count))
.Select(_ => new List<string>())
.ToList();

foreach (var column in Columns)
{
foreach (var val in column)
// copy values for this column into each row
for (int i = 0; i < column.Count; i++)
{
rows[column.IndexOf(val)].Add(val);
// rows[i] is a list representing the ith row
// append the column value to the end of the row list
// the "next" position in the row
rows[i].Add(column[i]);
}
}

// insert the header row first
rows.Insert(0, Header);

return rows;
}

/// <summary>
/// Serialize this <c>Columnar</c> to a stream as delimited tabular data:
/// the header row first, then one line per data row.
/// </summary>
/// <remarks>
/// Fields are joined with the delimiter given at construction. The writer is
/// disposed before returning, which also closes <paramref name="destination"/>.
/// </remarks>
/// <param name="destination">A writable <c>Stream</c> target for this <c>Columnar</c>.</param>
public void Write(Stream destination)
{
    using (var writer = new StreamWriter(destination))
    {
        // one output line per row, fields separated by the configured delimiter
        Rows().ForEach(fields => writer.WriteLine(String.Join(_delimiter, fields)));
    }
}

/// <summary>
/// Read delimited data from a stream and convert into columnar format.
/// </summary>
/// <param name="stream">The source of data.</param>
/// <param name="delimiter">The delimiter used to separate fields in the data.</param>
/// <returns>A <c>Tuple</c> containing two items:
/// <list type="bullet">
/// <item>
/// <term><c>List{String}</c></term>
/// <description>The ordered header row of column names.</description>
/// </item>
/// <item>
/// <term><c>Dictionary{String,List{String}}</c></term>
/// <description>
/// The data columns, where the key is the column name
/// and the value is the list of values in the column.
/// </description>
/// </item>
/// </list>
/// </returns>
private static Tuple<List<string>, Dictionary<string, List<string>>> Parse(Stream stream, string delimiter)
{
var header = new List<string>();
Expand Down Expand Up @@ -170,5 +194,64 @@ private static Tuple<List<string>, Dictionary<string, List<string>>> Parse(Strea
)
);
}

#region IEquatable<Columnar>

/// <summary>
/// Determine whether another <c>Columnar</c> has the same ordered header
/// and the same values in every column.
/// </summary>
/// <remarks>
/// The delimiter is not part of the comparison. A column that is missing
/// from <paramref name="other"/> makes the two instances unequal rather than
/// raising an exception, since an equality check is expected not to throw.
/// </remarks>
/// <param name="other">The instance to compare against.</param>
/// <returns><c>true</c> when header and column values match; otherwise <c>false</c>.</returns>
public bool Equals(Columnar other)
{
    if (other is null)
    {
        return false;
    }

    // an instance always equals itself; skip the value-by-value comparison
    if (ReferenceEquals(this, other))
    {
        return true;
    }

    // column names must match, in order
    if (!_header.SequenceEqual(other._header))
    {
        return false;
    }

    foreach (var column in _data)
    {
        // TryGetValue instead of the indexer: a key absent from the other
        // instance means "not equal", not a KeyNotFoundException
        if (!other._data.TryGetValue(column.Key, out var otherValues)
            || !column.Value.SequenceEqual(otherValues))
        {
            return false;
        }
    }

    return true;
}

/// <summary>
/// Determine whether an arbitrary object is a <c>Columnar</c> equal to this one.
/// </summary>
/// <param name="obj">The object to compare against.</param>
/// <returns>
/// <c>false</c> for <c>null</c> or a non-<c>Columnar</c> object; otherwise the
/// result of the typed <c>Equals</c> comparison.
/// </returns>
public override bool Equals(object obj) =>
    // the type pattern fails for null as well as for foreign types
    obj is Columnar columnar && Equals(columnar);

/// <summary>
/// Compute a hash code from the column names followed by every column value.
/// </summary>
/// <remarks>
/// The hash is derived from the current header and data, so it changes
/// whenever columns are removed or their values are rewritten.
/// </remarks>
/// <returns>The combined hash code.</returns>
public override int GetHashCode()
{
    var combined = new HashCode();

    // column names first, then each value of each column, in stored order
    var parts = _header.Concat(_data.Values.SelectMany(values => values));
    foreach (var part in parts)
    {
        combined.Add(part);
    }

    return combined.ToHashCode();
}

#endregion
}
}
Loading

0 comments on commit a27e486

Please sign in to comment.