PageRenderTime 97ms CodeModel.GetById 33ms RepoModel.GetById 0ms app.codeStats 0ms

/src/main/scala/com/minzc/triple/exp/StrTools.scala

https://github.com/Minzc/triple
Scala | 80 lines | 66 code | 10 blank | 4 comment | 2 complexity | 9fe4db5e27031635ce96e955d5ce47d9 MD5 | raw file
  1. package com.minzc.triple.exp
  2. import org.apache.commons.cli.{Options, CommandLine}
  3. import org.apache.hadoop.conf.Configuration
  4. import _root_.util.{AdvFile, TripleConsts}
  5. import scala.io.Source
  6. import java.util
  7. import com.yeezhao.commons.util.{CliRunner, StringUtil, FreqDist, AdvCli}
  8. import scala.collection.JavaConversions._
  9. import java.io.{FileInputStream, FileWriter}
  10. import org.ansj.recognition.NatureRecognition
  11. import org.ansj.splitWord.analysis.ToAnalysis
  /**
   * Command-line entry point for the string tools.
   *
   * Forwards the raw arguments to `AdvCli.initRunner`, passing the label "StringTools" and a
   * freshly constructed `StrTools` runner.
   *
   * @author congzicun
   * @since 17/6/14 11:56 AM
   */
  object StrToolsRunner {
    // The explicit `Unit` result type keeps this a valid JVM entry point regardless of what
    // `AdvCli.initRunner` returns (an inferred non-Unit result would not be launchable).
    def main(args: Array[String]): Unit = {
      AdvCli.initRunner(args, "StringTools", new StrTools())
    }
  }
  /**
   * CLI runner exposing two file-level word-segmentation helpers, selected by the `func` option:
   *
   *  - `seg`  -> [[segFile]]: dictionary-driven segmentation via `StringUtil.backwardMaxMatch`
   *  - `ansj` -> [[ansjSegFile]]: ansj tokenisation followed by a frequency count of terms whose
   *    nature string contains "n"
   */
  class StrTools extends CliRunner {
    val PARAM_FUNC = "func"
    val SEGMENT = "seg"
    val SEGANSJ = "ansj"
    val conf = new Configuration()
    conf.addResource(TripleConsts.CONFIG_FILE)

    /**
     * Dispatches to the function named by the `func` option.
     *
     * @param cmdLine parsed command line; supplies the input file, output file and `func` values
     * @throws IllegalArgumentException if `func` is missing or is neither `seg` nor `ansj`
     */
    def start(cmdLine: CommandLine): Unit = {
      val inputFile = cmdLine.getOptionValue(AdvCli.CLI_PARAM_I)
      val outputFile = cmdLine.getOptionValue(AdvCli.CLI_PARAM_O)
      cmdLine.getOptionValue(PARAM_FUNC) match {
        case SEGMENT => segFile(inputFile, outputFile)
        case SEGANSJ => ansjSegFile(inputFile, outputFile)
        // Previously an unmatched value surfaced as an opaque scala.MatchError.
        case other =>
          throw new IllegalArgumentException(
            "Unsupported value for -%s: %s (expected '%s' or '%s')".format(PARAM_FUNC, other, SEGMENT, SEGANSJ))
      }
    }

    /** Declares the input-file, output-file and `func` options, each taking one argument. */
    def initOptions(): Options = {
      val options = new Options()
      options.addOption(AdvCli.CLI_PARAM_I, true, "input file")
      options.addOption(AdvCli.CLI_PARAM_O, true, "output file")
      options.addOption(PARAM_FUNC, true, "function")
      // Return the instance explicitly instead of relying on addOption's return value.
      options
    }

    /**
     * Accepts the command line only when the input file, output file and `func` options are all
     * present; `start` cannot do anything useful without any one of them.
     */
    def validateOptions(cmdLine: CommandLine): Boolean =
      cmdLine.hasOption(AdvCli.CLI_PARAM_I) &&
        cmdLine.hasOption(AdvCli.CLI_PARAM_O) &&
        cmdLine.hasOption(PARAM_FUNC)

    /**
     * Tokenises every line of `inputFile` with ansj (`ToAnalysis.parse` followed by
     * `NatureRecognition.recognition`), counts each term whose nature string contains "n" under
     * the key `name$nature`, and writes one `key<TAB>value` line per counter entry to `outputFile`.
     *
     * @param inputFile  path of the text file to analyse
     * @param outputFile path of the file that receives the counts
     */
    def ansjSegFile(inputFile: String, outputFile: String): Unit = {
      val counter = new FreqDist[String]
      val input = new FileInputStream(inputFile)
      try {
        AdvFile.loadFileInDelimitLine(input, (ln) => {
          val posTags = ToAnalysis.parse(ln)
          new NatureRecognition(posTags).recognition()
          posTags.filter(_.getNatrue.natureStr.contains("n")).foreach(term => counter.incr(term.getName + "$" + term.getNatrue.natureStr))
        })
      } finally {
        // Closing twice is harmless should the loader already have closed the stream.
        input.close()
      }
      val fileWriter = new FileWriter(outputFile)
      try {
        counter.foreach(pair => fileWriter.write("%s\t%s\n".format(pair._1, pair._2)))
      } finally {
        fileWriter.close()
      }
      println("output file is " + outputFile)
    }

    /**
     * Segments every line of `inputFile` with `StringUtil.backwardMaxMatch` against the dictionary
     * named by the `TripleConsts.CONF_DIC_FILE` configuration entry, and writes the matched words
     * of each line, joined by single spaces, as one line of `outputFile`.
     *
     * @param inputFile  path of the text file to segment
     * @param outputFile path of the file that receives the segmented lines
     */
    def segFile(inputFile: String, outputFile: String): Unit = {
      // Every dictionary line is stored as both key and value.
      val dic = new util.HashMap[String, String]
      AdvFile.loadFileInDelimitLine(conf.getConfResourceAsInputStream(conf.get(TripleConsts.CONF_DIC_FILE)), (ln: String) => dic.put(ln, ln))
      val fileWriter = new FileWriter(outputFile)
      try {
        val source = Source.fromFile(inputFile)
        try {
          source.getLines().zipWithIndex.foreach { case (line, idx) =>
            // NOTE(review): 100 and 1 look like maximum/minimum match lengths - confirm against
            // StringUtil.backwardMaxMatch.
            val wordsPos = StringUtil.backwardMaxMatch(line, dic, 100, 1)
            // Each position's first/second are used as the substring's begin/end offsets.
            val segrst = wordsPos.map(position => line.substring(position.first, position.second)).mkString(" ")
            fileWriter.write(segrst + '\n')
            println("Finishing processing %s\r".format(idx + 1))
          }
        } finally {
          source.close()
        }
      } finally {
        fileWriter.close()
      }
      println("Output file is " + outputFile)
    }
  }