PageRenderTime 25ms CodeModel.GetById 14ms RepoModel.GetById 1ms app.codeStats 0ms

/modules/dataset/app/models/DataSetStatistics.scala

https://github.com/delving/culture-hub
Scala | 113 lines | 80 code | 27 blank | 6 comment | 3 complexity | 7a3ab0e426ea9095ceb432c7bae752d7 MD5 | raw file
  1. package models
  2. package statistics
  3. import _root_.util.OrganizationConfigurationHandler
  4. import java.util.Date
  5. import org.bson.types.ObjectId
  6. import com.novus.salat.dao.SalatDAO
  7. import HubMongoContext._
  8. import eu.delving.stats.Stats
  9. import collection.JavaConverters._
  10. import com.mongodb.casbah.Imports._
  /**
   * Statistics document for one dataset upload: a record count plus a histogram of
   * field counts. Histograms for individual paths are not embedded here; they are
   * fetched on demand through [[getHistogram]].
   *
   * @param _id         identifier of this statistics document (a fresh ObjectId by default)
   * @param context     organization / spec / upload this document belongs to
   * @param recordCount record count (presumably of the analysed dataset; confirm against the producer)
   * @param fieldCount  histogram of field counts
   *
   * @author Manuel Bernhardt <bernhardt.manuel@gmail.com>
   */
  case class DataSetStatistics(
    _id: ObjectId = new ObjectId,
    context: DataSetStatisticsContext,
    recordCount: Int,
    fieldCount: Histogram
  ) {

    /**
     * Resolves the file store of the organization named by `context.orgId` and asks it
     * for one entry whose `orgId`, `spec` and `uploadDate` match this document's context.
     */
    def getStatisticsFile = {
      val fileStore = hubFileStores.getResource(OrganizationConfigurationHandler.getByOrgId(context.orgId))
      val fileQuery = MongoDBObject(
        "orgId" -> context.orgId,
        "spec" -> context.spec,
        "uploadDate" -> context.uploadDate
      )
      fileStore.findOne(fileQuery)
    }

    /**
     * Looks up the histogram stored for a single path.
     *
     * The child collection is searched for entries whose parent is this document and
     * whose context (orgId, spec, uploadDate) and `path` match; the first hit wins.
     *
     * @param path          path of the field whose histogram is wanted
     * @param configuration organization configuration used to resolve the DAO
     * @return the histogram of the first matching entry, or `None` when nothing matches
     */
    def getHistogram(path: String)(implicit configuration: OrganizationConfiguration): Option[Histogram] = {
      val frequencies = DataSetStatistics.dao.frequencies
      val frequencyQuery = MongoDBObject(
        "context.orgId" -> context.orgId,
        "context.spec" -> context.spec,
        "context.uploadDate" -> context.uploadDate,
        "path" -> path
      )
      val matches = frequencies.findByParentId(_id, frequencyQuery).toList
      matches.headOption.map(entry => entry.histogram)
    }
  }
  /**
   * Histogram recorded for one path, kept as a child document of a
   * [[DataSetStatistics]] entry (the DAO in this file links it through `parentId`).
   *
   * @param _id       identifier of this document (a fresh ObjectId by default)
   * @param parentId  identifier of the owning statistics document
   * @param context   organization / spec / upload this document belongs to
   * @param path      path the histogram was recorded for
   * @param histogram the histogram itself
   */
  case class FieldFrequencies(
    _id: ObjectId = new ObjectId,
    parentId: ObjectId,
    context: DataSetStatisticsContext,
    path: String,
    histogram: Histogram
  )
  /**
   * Value statistics recorded for one path, kept as a child document of a
   * [[DataSetStatistics]] entry (the DAO in this file links it through `parentId`).
   *
   * @param _id        identifier of this document (a fresh ObjectId by default)
   * @param parentId   identifier of the owning statistics document
   * @param context    organization / spec / upload this document belongs to
   * @param path       path the value statistics were recorded for
   * @param valueStats the value statistics themselves
   */
  case class FieldValues(
    _id: ObjectId = new ObjectId,
    parentId: ObjectId,
    context: DataSetStatisticsContext,
    path: String,
    valueStats: ValueStats
  )
  /**
   * Persisted form of a histogram.
   *
   * @param present    "present" count (presumably the number of records carrying the value; confirm against eu.delving.stats.Stats)
   * @param absent     "absent" count (presumably the number of records lacking the value; confirm)
   * @param counterMap counters by key; empty unless supplied explicitly
   */
  case class Histogram(
    present: Int,
    absent: Int,
    counterMap: Map[String, Counter] = Map.empty
  )
  object Histogram {

    /**
     * Builds the persisted histogram from the statistics library's histogram.
     *
     * Only `present` and `absent` are carried over; `counterMap` keeps its empty default.
     *
     * @param histogram histogram produced by the statistics library
     * @return the persisted counterpart without any counters
     */
    def apply(histogram: Stats.Histogram): Histogram = {
      // TODO if we need this we have to think about how to store these things, since the values don't perform too well as keys
      // counterMap = histogram.counterMap.asScala.map(h => (h._1 -> Counter(h._2.count, h._2.percentage, h._2.value, h._2.proportion))).toMap
      new Histogram(histogram.present, histogram.absent)
    }
  }
  /**
   * One histogram entry.
   *
   * @param count      count recorded for the value
   * @param percentage percentage as text (presumably pre-formatted by the statistics library; confirm)
   * @param value      the value being counted
   * @param proportion proportion as a number (presumably the share of the total; confirm)
   */
  case class Counter(
    count: Int,
    percentage: String,
    value: String,
    proportion: Double
  )
  /**
   * Persisted form of the value statistics of a field.
   *
   * @param total      total reported by the statistics library
   * @param unique     uniqueness flag reported by the statistics library
   * @param values     histogram of values, when one was produced
   * @param wordCounts histogram of word counts, when one was produced
   */
  case class ValueStats(
    total: Int,
    unique: Boolean,
    values: Option[Histogram],
    wordCounts: Option[Histogram]
  )
  object ValueStats {

    /**
     * Builds the persisted value statistics from the statistics library's representation.
     *
     * A `null` uniqueness flag is read as `false`; `null` histograms become `None`.
     *
     * @param s value statistics produced by the statistics library
     * @return the persisted counterpart
     */
    def apply(s: Stats.ValueStats): ValueStats = {
      val total = s.total
      // the flag may be null on the library side, in which case it counts as "not unique"
      val isUnique: Boolean = s.unique != null && s.unique
      val valueHistogram = Option(s.values).map(v => Histogram(v))
      val wordCountHistogram = Option(s.wordCounts).map(w => Histogram(w))
      new ValueStats(total, isUnique, valueHistogram, wordCountHistogram)
    }
  }
  /**
   * Context shared by the statistics documents in this file. `orgId`, `spec`,
   * `schema` and `uploadDate` are the fields used as query keys elsewhere in this file.
   *
   * @param orgId           identifier of the organization
   * @param spec            spec of the dataset
   * @param schema          schema the statistics refer to
   * @param provider        provider
   * @param dataProvider    data provider
   * @param providerUri     URI of the provider
   * @param dataProviderUri URI of the data provider
   * @param uploadDate      upload date the statistics refer to
   */
  case class DataSetStatisticsContext(
    orgId: String,
    spec: String,
    schema: String,
    provider: String,
    dataProvider: String,
    providerUri: String,
    dataProviderUri: String,
    uploadDate: Date
  )
  /** Model companion wiring [[DataSetStatistics]] to its collection, indexes and DAO. */
  object DataSetStatistics extends MultiModel[DataSetStatistics, DataSetStatisticsDAO] {

    /** Name under which the statistics collection is registered. */
    def connectionName: String = "DataSetStatistics"

    /** Registers the statistics-context indexes on the given collection. */
    def initIndexes(collection: MongoCollection): Unit = {
      addIndexes(collection, dataSetStatisticsContextIndexes, dataSetStatisticsContextIndexNames)
    }

    /**
     * Creates the DAO for the given collection and connection.
     *
     * @param collection    collection holding the statistics documents
     * @param connection    database the child collections are taken from
     * @param configuration organization configuration (not used by this implementation)
     */
    def initDAO(collection: MongoCollection, connection: MongoDB)(implicit configuration: OrganizationConfiguration): DataSetStatisticsDAO =
      new DataSetStatisticsDAO(collection, connection)
  }
  /**
   * DAO for [[DataSetStatistics]] documents and their two child collections
   * (field frequencies and field values).
   *
   * @param collection collection holding the statistics documents
   * @param connection database the child collections are taken from
   */
  class DataSetStatisticsDAO(collection: MongoCollection, connection: MongoDB)
    extends SalatDAO[DataSetStatistics, ObjectId](collection) {

    // Child collection with the per-path histograms, indexed like the statistics collection itself.
    lazy val fieldFrequencies = connection("DataSetStatisticsFieldFrequencies")
    addIndexes(fieldFrequencies, dataSetStatisticsContextIndexes, dataSetStatisticsContextIndexNames)

    // Child collection with the per-path value statistics, indexed the same way.
    lazy val fieldValues = connection("DataSetStatisticsFieldValues")
    addIndexes(fieldValues, dataSetStatisticsContextIndexes, dataSetStatisticsContextIndexNames)

    /** Access to the [[FieldFrequencies]] children, linked to their parent through `parentId`. */
    val frequencies = new ChildCollection[FieldFrequencies, ObjectId](
      collection = fieldFrequencies,
      parentIdField = "parentId"
    ) {}

    /** Access to the [[FieldValues]] children, linked to their parent through `parentId`. */
    val values = new ChildCollection[FieldValues, ObjectId](
      collection = fieldValues,
      parentIdField = "parentId"
    ) {}

    /**
     * Finds the statistics document with the highest `_id` among those matching the
     * given organization, spec and schema.
     *
     * @param orgId  identifier of the organization
     * @param spec   spec of the dataset
     * @param schema schema the statistics refer to
     * @return the matching document that sorts first by descending `_id`, if any
     */
    def getMostRecent(orgId: String, spec: String, schema: String) = {
      val contextQuery = MongoDBObject(
        "context.orgId" -> orgId,
        "context.spec" -> spec,
        "context.schema" -> schema
      )
      // descending _id order, so the first (and only fetched) result is the latest one inserted
      val descendingId = MongoDBObject("_id" -> -1)
      find(contextQuery).$orderby(descendingId).limit(1).toList.headOption
    }
  }