PageRenderTime 49ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/Sources/DataAccess/Analyze.cs

https://github.com/tpwalke2/DataTable
C# | 512 lines | 318 code | 82 blank | 112 comment | 23 complexity | 6453edbe14be1e67108b9c7a1ef4047f MD5 | raw file
  1. using System;
  2. using System.Collections.Generic;
  3. using System.Linq;
  4. using System.Text;
  5. using System.IO;
  6. using System.Diagnostics.CodeAnalysis;
  7. namespace DataAccess
  8. {
  9. /// <summary>
  10. /// Analysis operations on tables, like joins, histogram, dup search, filter, etc.
  11. /// These handle large tables.
  12. /// </summary>
  13. public static class Analyze
  14. {
  15. /// <summary>
  16. /// Given a potentially extremely large table, shred it into smaller CSV files based on the values in columnName.
  17. /// This can be very useful for easily building an index for a large file.
  18. /// For each unique value in column, funcCreateStream is invoked with that value to get a TextWriter. The csv is written to that writer.
  19. /// The ordering within each small file is preserved
  20. /// This stream based overload is useful when you need to avoid writing to the local file system (such as with Azure storage)
  21. /// </summary>
  22. /// <param name="table">original table to shred</param>
  23. /// <param name="funcCreateStream">callback function to create a stream for each new table.</param>
  24. /// <param name="columnName">column name to use for shredding. You can use <see cref="GetColumnValueCounts"/>
  25. /// to see the variation in each column to determine a good column to use for shredding.
  26. /// </param>
  27. public static void Shred(DataTable table, string columnName, Func<string, TextWriter> funcCreateStream)
  28. {
  29. Dictionary<string, TextWriter> dict = new Dictionary<string, TextWriter>();
  30. try
  31. {
  32. foreach (Row row in table.Rows)
  33. {
  34. TextWriter tw;
  35. string val = row[columnName];
  36. if (!dict.TryGetValue(val, out tw))
  37. {
  38. // New value
  39. tw = funcCreateStream(val);
  40. dict[val] = tw;
  41. CsvWriter.RawWriteLine(table.ColumnNames, tw); // header
  42. }
  43. CsvWriter.RawWriteLine(row.Values, tw);
  44. }
  45. }
  46. finally
  47. {
  48. foreach (var kv in dict)
  49. {
  50. kv.Value.Close();
  51. }
  52. }
  53. }
  54. /// <summary>
  55. /// Given a potentially extremely large table, shred it into smaller CSV files based on the values in columnName.
  56. /// This can be very useful for easily building an index for a large file.
  57. /// For each unique value in column, a CSV file is created and named string.Format(templateFilename, value).
  58. /// The ordering within each small file is preserved
  59. /// </summary>
  60. /// <param name="table">original table to shred</param>
  61. /// <param name="columnName">column name to use for shredding. You can use <see cref="GetColumnValueCounts"/>
  62. /// to see the variation in each column to determine a good column to use for shredding.
  63. /// </param>
  64. /// <param name="templateFilename">template specifying filename of shredded files.</param>
  65. public static void Shred(DataTable table, string columnName, string templateFilename)
  66. {
  67. Func<string, TextWriter> func =
  68. (value) =>
  69. {
  70. string destination = string.Format(templateFilename, value);
  71. TextWriter tw = new StreamWriter(destination);
  72. return tw;
  73. };
  74. Shred(table, columnName, func);
  75. }
  76. /// <summary>
  77. /// Apply a Where filter to a table. This can stream over large data and filter it down.
  78. /// </summary>
  79. /// <param name="table">source table</param>
  80. /// <param name="fpSelector">predicate to execute on each row</param>
  81. /// <returns>a new table that copies out rows from from the source table</returns>
  82. public static DataTable Where(DataTable table, Func<Row, bool> fpSelector)
  83. {
  84. TableWriter writer = new TableWriter(table);
  85. int count = 0;
  86. foreach(Row row in table.Rows)
  87. {
  88. bool keep = fpSelector(row);
  89. if (keep)
  90. {
  91. writer.AddRow(row);
  92. count++;
  93. }
  94. }
  95. return writer.CloseAndGetTable();
  96. }
  97. // $$$ Clarify - multiple joins (inner, outer, etc)
  98. /// <summary>
  99. /// Performs a full outer join on two in-memory tables and returns a new table.
  100. /// The number of rows in the resulting table is the sum of rows from each source table.
  101. /// The number of columns in teh new table is the sum of columns in the the source tables minus 1
  102. /// (since the join column is redundant)
  103. /// </summary>
  104. /// <param name="d1"></param>
  105. /// <param name="d2"></param>
  106. /// <param name="columnName">column name to join on. Both tables must have this column name.</param>
  107. /// <returns>a new table</returns>
  108. public static MutableDataTable Join(MutableDataTable d1, MutableDataTable d2, string columnName)
  109. {
  110. Column c1 = d1.GetColumn(columnName);
  111. if (c1 == null)
  112. {
  113. throw new InvalidOperationException("Missing column");
  114. }
  115. Column c2 = d2.GetColumn(columnName);
  116. if (c2 == null)
  117. {
  118. throw new InvalidOperationException("Missing column");
  119. }
  120. // Place d1 in first set of columns, and d2 in second set.
  121. int kColumn = d1.Columns.Length;
  122. int kTotalColumns = kColumn + d2.Columns.Length;
  123. // Indices into new table where join columns are.
  124. int joinColumn1 = Utility.GetColumnIndexFromName(d1.ColumnNames, columnName);
  125. int joinColumn2 = Utility.GetColumnIndexFromName(d2.ColumnNames, columnName) + kColumn;
  126. // $$$ could really optimize. Sort both on column and then zip.
  127. Dictionary<string, int> m1 = GetRowIndex(c1);
  128. Dictionary<string, int> m2 = GetRowIndex(c2);
  129. // $$$ column names may not be unique.
  130. //string[] headers = d1.ColumnNames.Union(d2.ColumnNames).ToArray();
  131. string[] headers = new string[kTotalColumns];
  132. Array.Copy(d1.ColumnNames.ToArray(), 0, headers, 0, kColumn);
  133. Array.Copy(d2.ColumnNames.ToArray(), 0, headers, kColumn, kTotalColumns - kColumn);
  134. string[] values = new string[headers.Length];
  135. string path = GetTempFileName();
  136. using (CsvWriter tw = new CsvWriter(path, headers))
  137. {
  138. foreach (var kv in m1)
  139. {
  140. Clear(values);
  141. string key = kv.Key; // join column
  142. int r1 = kv.Value;
  143. int r2;
  144. if (m2.TryGetValue(key, out r2))
  145. {
  146. // In both. write out
  147. CopyRowIntoArray(values, kColumn, d2, r2);
  148. m2.Remove(key);
  149. }
  150. else
  151. {
  152. // Only in M1.
  153. }
  154. CopyRowIntoArray(values, 0, d1, r1);
  155. values[joinColumn1] = values[joinColumn2] = key;
  156. tw.WriteRow(values);
  157. }
  158. // We remove all of M1's items from m2, so M2 is just unique items now. (possibly 0).
  159. // Tag those onto the end.
  160. foreach (var kv in m2)
  161. {
  162. int r2 = kv.Value;
  163. Clear(values);
  164. CopyRowIntoArray(values, kColumn, d2, r2);
  165. values[joinColumn1] = values[joinColumn2] = kv.Key;
  166. tw.WriteRow(values);
  167. }
  168. } // close tw
  169. MutableDataTable t = Reader.ReadCSV(path);
  170. DeleteLocalFile(path);
  171. // Remove duplicate columns.
  172. t.DeleteColumn(joinColumn2);
  173. return t;
  174. }
  175. static void CopyRowIntoArray(string[] values, int index, MutableDataTable d, int row)
  176. {
  177. for (int c = 0; c < d.Columns.Length; c++)
  178. {
  179. values[index] = d.Columns[c].Values[row];
  180. index++;
  181. }
  182. }
  183. static void Clear(string[] values)
  184. {
  185. for (int i = 0; i < values.Length; i++)
  186. {
  187. values[i] = string.Empty;
  188. }
  189. }
  190. static Dictionary<string, int> GetRowIndex(Column c)
  191. {
  192. Dictionary<string, int> d = new Dictionary<string, int>();
  193. for (int row = 0; row < c.Values.Length; row++)
  194. {
  195. string x = c.Values[row].ToUpperInvariant();
  196. // If this add fails, it means the column we're doing a join on has duplicate entries.
  197. d.Add(x, row); // verifies uniqueness
  198. }
  199. return d;
  200. }
  201. /// <summary>
  202. /// Return a sample that's the top N records from a table.
  203. /// This is useful to sample a large table and then save the sample.
  204. /// </summary>
  205. /// <param name="table">source table</param>
  206. /// <param name="topN">positive value specifying number of rows to copy from from source table</param>
  207. /// <returns>The topN rows from the source table.</returns>
  208. public static MutableDataTable SampleTopN(DataTable table, int topN)
  209. {
  210. if (topN <= 0)
  211. {
  212. throw new ArgumentOutOfRangeException("topN", "sample must be a positive integer");
  213. }
  214. TableWriter writer = new TableWriter(table);
  215. foreach (var row in table.Rows)
  216. {
  217. topN--;
  218. writer.AddRow(row);
  219. if (topN == 0)
  220. {
  221. // Check topN before the enumeration to avoid pulling a wasted row from the source table
  222. break;
  223. }
  224. }
  225. return DataTable.New.GetMutableCopy(writer.CloseAndGetTable());
  226. }
  227. private static int[] GetColumnIndexFromNames(DataTable table, string[] columnNames)
  228. {
  229. return Array.ConvertAll(columnNames, columnName => GetColumnIndexFromName(table, columnName));
  230. }
  231. // Return 0-based index of column with matching name.
  232. // throws an exception if not found
  233. private static int GetColumnIndexFromName(DataTable table, string columnName)
  234. {
  235. string[] columnNames = table.ColumnNames.ToArray();
  236. return Utility.GetColumnIndexFromName(columnNames, columnName);
  237. }
  238. /// <summary>
  239. /// Extract column as a histogram, sorted in descending order by frequency.
  240. /// </summary>
  241. /// <param name="table">source table</param>
  242. /// <param name="columnName">column within short table</param>
  243. /// <returns>collection of tuples, where each tuple is a value and the count of that value within the column</returns>
  244. public static Tuple<string, int>[] AsHistogram(DataTable table, string columnName)
  245. {
  246. int i = GetColumnIndexFromName(table, columnName);
  247. return AsHistogram(table, i);
  248. }
  249. /// <returns></returns>
  250. /// <summary>
  251. /// Extract column as a histogram, sorted in descending order by frequency.
  252. /// </summary>
  253. /// <param name="table">source table</param>
  254. /// <param name="columnIdx">0-based index of column </param>
  255. /// <returns>collection of tuples, where each tuple is a value and the count of that value within the column</returns>
  256. public static Tuple<string, int>[] AsHistogram(DataTable table, int columnIdx)
  257. {
  258. Dictionary<string, int> values = new Dictionary<string, int>();
  259. //string name = "unknown";
  260. foreach (Row row in table.Rows)
  261. {
  262. var parts = row.Values;
  263. if (columnIdx >= parts.Count)
  264. {
  265. // malformed input file
  266. continue;
  267. }
  268. string p = parts[columnIdx];
  269. int count;
  270. values.TryGetValue(p, out count);
  271. count++;
  272. values[p] = count;
  273. }
  274. // Get top N?
  275. var items = from kv in values
  276. orderby kv.Value descending
  277. select Tuple.Create(kv.Key, kv.Value)
  278. ;
  279. //int N = 10;
  280. //return items.Take(N).ToArray();
  281. return items.ToArray();
  282. }
  283. /// <summary>
  284. /// Produces a table where each row is the number of unique values in a source column, followed by the top N occurences in that column.
  285. /// </summary>
  286. /// <param name="table">source table</param>
  287. /// <param name="N">number of top N occurences to include in the summary table </param>
  288. /// <returns>a summary table</returns>
  289. public static MutableDataTable GetColumnValueCounts(DataTable table, int N)
  290. {
  291. if (N < 0)
  292. {
  293. throw new ArgumentOutOfRangeException("N");
  294. }
  295. string[] names = table.ColumnNames.ToArray();
  296. int count = names.Length;
  297. MutableDataTable dSummary = new MutableDataTable();
  298. Column c1 = new Column("column name", count);
  299. Column c2 = new Column("count", count);
  300. int kFixed = 2;
  301. Column[] cAll = new Column[kFixed + N * 2];
  302. cAll[0] = c1;
  303. cAll[1] = c2;
  304. for (int i = 0; i < N; i++)
  305. {
  306. cAll[i * 2 + kFixed] = new Column("Top Value " + i, count);
  307. cAll[i * 2 + 1 + kFixed] = new Column("Top Occurrence " + i, count);
  308. }
  309. dSummary.Columns = cAll;
  310. int columnId = 0;
  311. foreach (string name in names)
  312. {
  313. Tuple<string, int>[] hist = AsHistogram(table, columnId);
  314. c1.Values[columnId] = name;
  315. c2.Values[columnId] = hist.Length.ToString();
  316. for (int i = 0; i < N; i++)
  317. {
  318. if (i >= hist.Length)
  319. {
  320. break;
  321. }
  322. cAll[i * 2 + kFixed].Values[columnId] = hist[i].Item1;
  323. cAll[i * 2 + 1 + kFixed].Values[columnId] = hist[i].Item2.ToString();
  324. }
  325. columnId++;
  326. }
  327. return dSummary;
  328. }
  329. /// <summary>
  330. /// Find all rows that have dups for the given columns.
  331. /// This uses a multi-pass algorithm to operate on a large data file.
  332. /// </summary>
  333. /// <param name="table">original table</param>
  334. /// <param name="columnNames">set of columns to compare to look for duplicates</param>
  335. /// <returns>a table that's a subset of the original table</returns>
  336. public static DataTable SelectDuplicates(DataTable table, params string[] columnNames)
  337. {
  338. int[] ci = GetColumnIndexFromNames(table, columnNames);
  339. // Store on hash keys first. Use hash keys because they're compact and efficient for large data sets
  340. // But then we do need to handle collisions.
  341. HashSet<int> allKeys = new HashSet<int>();
  342. HashSet<int> possibleDups = new HashSet<int>();
  343. //
  344. // Take a first pass and store the hash of each row's unique Key
  345. //
  346. foreach (Row row in table.Rows)
  347. {
  348. var parts = row.Values;
  349. int hash = CalcHash(parts, ci);
  350. if (allKeys.Contains(hash))
  351. {
  352. possibleDups.Add(hash);
  353. }
  354. else
  355. {
  356. allKeys.Add(hash);
  357. }
  358. }
  359. allKeys = null; // Free up for GC
  360. //
  361. // Now take a second pass through the dups.
  362. //
  363. Dictionary<string, Row> fullMatch = new Dictionary<string, Row>();
  364. StringBuilder sb = new StringBuilder();
  365. TableWriter writer = new TableWriter(table);
  366. foreach (Row row in table.Rows)
  367. {
  368. {
  369. var parts = row.Values;
  370. int hash = CalcHash(parts, ci);
  371. if (!possibleDups.Contains(hash))
  372. {
  373. continue;
  374. }
  375. // Potential match
  376. sb.Clear();
  377. foreach (int i in ci)
  378. {
  379. sb.Append(parts[i]);
  380. sb.Append(',');
  381. }
  382. string key = sb.ToString();
  383. if (fullMatch.ContainsKey(key))
  384. {
  385. Row firstLine = fullMatch[key];
  386. if (firstLine != null)
  387. {
  388. writer.AddRow(firstLine);
  389. fullMatch[key] = null;
  390. }
  391. // Real dup!
  392. writer.AddRow(row);
  393. }
  394. else
  395. {
  396. fullMatch[key] = row;
  397. }
  398. }
  399. } // reader
  400. return writer.CloseAndGetTable();
  401. }
  402. // Helper for finding duplicates.
  403. private static int CalcHash(IList<string> parts, int[] ci)
  404. {
  405. int h = 0;
  406. foreach (int i in ci)
  407. {
  408. h += parts[i].GetHashCode();
  409. }
  410. return h;
  411. }
  412. /// <summary>
  413. /// For azure usage, allow hooking the function used to create temporary files
  414. /// </summary>
  415. [SuppressMessage("Microsoft.Usage", "CA2211:NonConstantFieldsShouldNotBeVisible")]
  416. public static Func<string> GetTempFileName = System.IO.Path.GetTempFileName;
  417. [SuppressMessage("Microsoft.Design", "CA1031:DoNotCatchGeneralExceptionTypes")]
  418. private static void DeleteLocalFile(string file)
  419. {
  420. try
  421. {
  422. System.IO.File.Delete(file);
  423. }
  424. catch
  425. {
  426. // Not fatal.
  427. }
  428. }
  429. }
  430. }