/src/gpaligner/GPAligner.scala
Scala | 156 lines | 121 code | 18 blank | 17 comment | 12 complexity | 05ca76f6bc9bad81659506dd16a4b3d3 MD5 | raw file
/*
 * GPAligner.scala
 *
 * To change this template, choose Tools | Template Manager
 * and open the template in the editor.
 */
- package gpaligner
- import scala.annotation.tailrec
- import CoreTypes._
- import CoreTypes.AlignmentResult._
/**
 * Aligns the entries read from a [[GPInput]] and writes each aligned entry
 * to an [[AlignedOutput]].
 *
 * Two [[AlignmentModel]] instances are maintained while aligning:
 * `solvedModel` is incremented with every alignment that has been written to
 * the output, while `unsolvedModel` is incremented with all candidate
 * alignments of ambiguous entries and decremented again as those entries
 * are resolved.
 *
 * @param inputFile  passed to the `GPInput` constructor (presumably a file
 *                   path -- confirm against `GPInput`)
 * @param outputFile passed to the `AlignedOutput` constructor (presumably a
 *                   file path -- confirm against `AlignedOutput`)
 */
class GPAligner(inputFile: String, outputFile: String) {
  val input = new GPInput(inputFile)
  val output = new AlignedOutput(outputFile)

  /** Model updated with alignments that have already been emitted. */
  val solvedModel = new AlignmentModel

  /** Model holding the candidate alignments of still-ambiguous entries. */
  val unsolvedModel = new AlignmentModel

  // NOTE(review): the meaning of 120 is defined by KanjidicGenerator, which
  // is not visible from this file.
  val generator = new KanjidicGenerator(120)

  /** Smoothing constant added in both terms of the context score. */
  val alpha = 2.5

  /** Weight applied to counts taken from `solvedModel`. */
  val solvedWeight = 0.5

  /** Weight applied to counts taken from `unsolvedModel`. */
  val unsolvedWeight = 0.5

  require(0 < alpha, "alpha must be positive")
  // require(alpha < unsolvedWeight)
  require(unsolvedWeight <= solvedWeight, "unsolvedWeight must not exceed solvedWeight")

  /**
   * Mutable tally of how the entries seen during triage were classified.
   */
  class AlignmentStats(var overConstrained: Int = 0, var unique: Int = 0, var ambiguous: Int = 0, var excessive: Int = 0) {
    /** Prints the four counters on a single line. */
    def log: Unit = {
      println("%d overconstrained/%d resolved/%d ambiguous/%d excessive".format(overConstrained, unique, ambiguous, excessive))
    }
  }

  /**
   * Runs the full alignment: triages every input entry, then resolves the
   * ambiguous ones iteratively. Entries flagged as excessive by the
   * generator are counted and skipped.
   *
   * The output is closed when this method exits, including when one of
   * the passes throws.
   */
  def align: Unit = {
    try {
      println("FIRST PASS -- triaging entries")
      val (clouds, excessive) = buildClouds

      println("SECOND PASS -- %d ambiguous entries".format(clouds.size))
      val box = new CountDown(clouds.size)
      alignIteratively(clouds, box)
      box.finish

      println("Ignoring %d excessive entries".format(excessive.size))
      // if (excessive.size > 0) {
      //   println("THIRD PASS -- %d long entries".format(excessive.size))
      //   alignOnePass(excessive)
      // }
    } finally {
      // Release the output even if a pass failed part-way through.
      output.close
    }
  }

  /**
   * Align the clouds one at a time, updating the alignment model as we go.
   *
   * On every step the remaining clouds are ordered by descending
   * `bestScore`; the first one is resolved, its chosen alignment is added
   * to `solvedModel`, its discarded alignments are removed from
   * `unsolvedModel`, and the aligned entry is written to the output.
   *
   * NOTE(review): clouds built by `buildCloud` carry `Double.NaN` scores and
   * nothing on this path calls `rescore`. Whether `bestScore` is meaningful
   * here depends on `AlignmentCloud`; confirm that a `rescore(clouds)` step
   * before the sort is not missing.
   *
   * @param clouds the ambiguous clouds still to be resolved
   * @param box    progress display, updated whenever the number of
   *               remaining clouds is a multiple of 100
   */
  @tailrec
  private def alignIteratively(clouds: List[AlignmentCloud], box: CountDown): Unit = {
    if (clouds.nonEmpty) {
      val remaining = clouds.size
      if (remaining % 100 == 0) {
        box.update(remaining)
      }
      // Non-empty by the guard above, so head/tail are safe.
      val ranked = clouds.sortWith(_.bestScore > _.bestScore)
      val (alignedEntry, remainder) = ranked.head.resolve
      solvedModel.inc(alignedEntry.alignment)
      unsolvedModel.dec(remainder.iterator)
      output.consume(alignedEntry)
      alignIteratively(ranked.tail, box)
    }
  }

  /**
   * Align the clouds without changing the alignment model.
   *
   * Each entry gets a freshly built and rescored cloud, which is resolved
   * and written straight to the output. Currently only referenced from the
   * disabled third pass in `align`.
   *
   * @param entries the entries to align
   */
  private def alignOnePass(entries: List[Entry]): Unit = {
    val box = new CountDown(entries.size)
    var n = entries.size
    for (entry <- entries) {
      val cloud = rescore(buildCloud(entry, generator))
      // The alignments that were not chosen are not needed in this pass.
      val (alignedEntry, _) = cloud.resolve
      output.consume(alignedEntry)
      n -= 1
      if (n % 20 == 0) {
        box.update(n)
      }
    }
    box.finish
  }

  /**
   * Triages every entry of `input`.
   *
   *  - Entries the generator reports as excessive are collected and
   *    returned without being aligned.
   *  - Overconstrained clouds are only counted.
   *  - Resolved clouds are written to the output immediately and added to
   *    `solvedModel`.
   *  - Ambiguous clouds are added to `unsolvedModel` and returned.
   *
   * Both returned lists are built by prepending, so they are in reverse
   * input order.
   *
   * @return the ambiguous clouds and the excessive entries
   */
  private def buildClouds: (List[AlignmentCloud], List[Entry]) = {
    var clouds: List[AlignmentCloud] = Nil
    var excessive: List[Entry] = Nil
    val stats = new AlignmentStats
    var total = 0
    val box = new CountUp

    for (entry <- input) {
      if (generator.isExcessive(entry)) {
        stats.excessive += 1
        excessive = entry :: excessive
      } else {
        val cloud = buildCloud(entry, generator)
        cloud.status match {
          case Overconstrained => {
            stats.overConstrained += 1
          }
          case Resolved => {
            stats.unique += 1
            val (result, remainder) = cloud.resolve
            require(remainder.size == 0, "a resolved cloud must not leave alternative alignments")
            solvedModel.inc(result.alignment)
            output.consume(result)
          }
          case Ambiguous => {
            stats.ambiguous += 1
            unsolvedModel.inc(cloud.alignments)
            clouds = cloud :: clouds
          }
        }
      }
      total += 1
      if (total % 1000 == 0) {
        box.update(total)
      }
    }
    box.finish(total)
    stats.log

    (clouds, excessive)
  }

  /**
   * Builds the cloud of candidate alignments for one entry. Every
   * candidate is given a placeholder score of `Double.NaN`.
   *
   * @param entry     the entry to generate alignments for
   * @param generator the generator producing the candidate alignments
   */
  private def buildCloud(entry: Entry, generator: AlignmentGenerator): AlignmentCloud = {
    val alignments: Stream[Alignment] = generator.generate(entry)
    AlignmentCloud(entry, alignments.map(ScoredAlignment(Double.NaN, _)))
  }

  /** Rescores every cloud in the list. */
  private def rescore(cs: List[AlignmentCloud]): List[AlignmentCloud] = cs.map(rescore(_))

  /** Returns a new cloud for the same entry with every alignment rescored. */
  private def rescore(c: AlignmentCloud): AlignmentCloud = {
    new AlignmentCloud(c.entry, c.scoredAlignments.map(rescore(_)))
  }

  /** Pairs the alignment with its freshly computed score. */
  private def rescore(sa: ScoredAlignment): ScoredAlignment = {
    ScoredAlignment(rescore(sa.alignment), sa.alignment)
  }

  /** Scores an alignment as the mean of the scores of its contexts. */
  private def rescore(a: Alignment): Double = {
    Stats.mean(a.iterContext.map(rescore(_)))
  }

  /**
   * Scores a single context as `tf * idf`, where
   *
   *  - `tf  = (count(pivot) - unsolvedWeight + alpha) / count(pivot.grapheme)`
   *  - `idf = log(count(pivot) / (count(context) - unsolvedWeight + alpha))`
   *
   * and each count is the weighted count returned by the matching `wtf`.
   */
  private def rescore(gpc: GPContext): Double = {
    // Used by both terms, so look it up once.
    val pivotCount = wtf(gpc.pivot)
    val tf = (pivotCount - unsolvedWeight + alpha) / wtf(gpc.pivot.grapheme)
    val idf = scala.math.log(pivotCount / (wtf(gpc) - unsolvedWeight + alpha))
    tf * idf
  }

  /** Weighted sum of the grapheme's counts in the unsolved and solved models. */
  private def wtf(grapheme: String): Double =
    unsolvedWeight * unsolvedModel.gDist.getCount(grapheme) + solvedWeight * solvedModel.gDist.getCount(grapheme)

  /** Weighted sum of the segment's counts in the unsolved and solved models. */
  private def wtf(gp: Segment): Double =
    unsolvedWeight * unsolvedModel.gpDist.getCount(gp) + solvedWeight * solvedModel.gpDist.getCount(gp)

  /** Weighted sum of the context's counts in the unsolved and solved models. */
  private def wtf(gpc: GPContext): Double =
    unsolvedWeight * unsolvedModel.gpcDist.getCount(gpc) + solvedWeight * solvedModel.gpcDist.getCount(gpc)
}