/src/main/scala/com/minzc/triple/exp/StrTools.scala
Scala | 80 lines | 66 code | 10 blank | 4 comment | 2 complexity | 9fe4db5e27031635ce96e955d5ce47d9 MD5 | raw file
- package com.minzc.triple.exp
- import org.apache.commons.cli.{Options, CommandLine}
- import org.apache.hadoop.conf.Configuration
- import _root_.util.{AdvFile, TripleConsts}
- import scala.io.Source
- import java.util
- import com.yeezhao.commons.util.{CliRunner, StringUtil, FreqDist, AdvCli}
- import scala.collection.JavaConversions._
- import java.io.{FileInputStream, FileWriter}
- import org.ansj.recognition.NatureRecognition
- import org.ansj.splitWord.analysis.ToAnalysis
/**
 * Command-line entry point for the string tools.
 *
 * Hands the raw arguments to `AdvCli.initRunner` together with a fresh
 * [[StrTools]] instance registered under the tool name "StringTools".
 *
 * @author congzicun
 * @since 17/6/14 11:56 AM
 */
object StrToolsRunner {
  /**
   * Launches the tool.
   *
   * The return type is declared explicitly: a `main` whose inferred result
   * type is anything other than `Unit` is not a runnable JVM entry point.
   *
   * @param args raw command-line arguments, forwarded unchanged
   */
  def main(args: Array[String]): Unit = {
    AdvCli.initRunner(args, "StringTools", new StrTools())
  }
}
/**
 * CLI runner exposing two file-segmentation utilities, selected with the
 * `func` option:
 *
 *  - `seg`  -> [[segFile]]: dictionary-based segmentation of every input line.
 *  - `ansj` -> [[ansjSegFile]]: ansj segmentation plus a frequency count of
 *    the terms whose nature tag contains "n".
 *
 * Input and output paths are taken from the `AdvCli.CLI_PARAM_I` and
 * `AdvCli.CLI_PARAM_O` options.
 */
class StrTools extends CliRunner {
  /** Name of the option that selects which function to run. */
  val PARAM_FUNC = "func"
  /** `func` value that dispatches to [[segFile]]. */
  val SEGMENT = "seg"
  /** `func` value that dispatches to [[ansjSegFile]]. */
  val SEGANSJ = "ansj"

  /** Configuration with the project config resource added on construction. */
  val conf = new Configuration()
  conf.addResource(TripleConsts.CONFIG_FILE)

  /**
   * Dispatches to the function named by the `func` option.
   *
   * @param cmdLine parsed command line; expected to have passed [[validateOptions]]
   * @throws IllegalArgumentException if `func` is absent or is not one of the
   *                                  supported values (previously a bare MatchError)
   */
  def start(cmdLine: CommandLine): Unit = {
    val inputFile = cmdLine.getOptionValue(AdvCli.CLI_PARAM_I)
    val outputFile = cmdLine.getOptionValue(AdvCli.CLI_PARAM_O)
    cmdLine.getOptionValue(PARAM_FUNC) match {
      case SEGMENT => segFile(inputFile, outputFile)
      case SEGANSJ => ansjSegFile(inputFile, outputFile)
      case other =>
        throw new IllegalArgumentException(
          "Unsupported value for option '" + PARAM_FUNC + "': " + other +
            " (expected '" + SEGMENT + "' or '" + SEGANSJ + "')")
    }
  }

  /**
   * Declares the options understood by this runner: input file, output file
   * and function name, each taking one argument.
   *
   * @return the populated option set
   */
  def initOptions(): Options = {
    val options = new Options()
    options.addOption(AdvCli.CLI_PARAM_I, true, "input file")
    options.addOption(AdvCli.CLI_PARAM_O, true, "output file")
    options.addOption(PARAM_FUNC, true, "function")
    // Return the instance explicitly rather than relying on addOption's result.
    options
  }

  /**
   * Checks that every option read by [[start]] is present.
   *
   * `func` is required as well as the input/output options, because [[start]]
   * cannot dispatch without it.
   *
   * @param cmdLine parsed command line
   * @return true when input, output and function options are all present
   */
  def validateOptions(cmdLine: CommandLine): Boolean =
    cmdLine.hasOption(AdvCli.CLI_PARAM_I) &&
      cmdLine.hasOption(AdvCli.CLI_PARAM_O) &&
      cmdLine.hasOption(PARAM_FUNC)

  /**
   * Segments the input with ansj and writes a term-frequency table.
   *
   * Each chunk delivered by `AdvFile.loadFileInDelimitLine` is parsed with
   * `ToAnalysis`, run through `NatureRecognition`, and every term whose nature
   * string contains "n" is counted under the key `name$nature`
   * (NOTE(review): presumably this selects noun-like tags - confirm against
   * the ansj tag set). The counts are written as `key<TAB>count` lines.
   *
   * @param inputFile  path of the text file to analyse
   * @param outputFile path of the frequency table to write (overwritten)
   */
  def ansjSegFile(inputFile: String, outputFile: String): Unit = {
    val counter = new FreqDist[String]
    val input = new FileInputStream(inputFile)
    try {
      AdvFile.loadFileInDelimitLine(input, (ln) => {
        val posTags = ToAnalysis.parse(ln)
        new NatureRecognition(posTags).recognition()
        posTags
          .filter(_.getNatrue.natureStr.contains("n"))
          .foreach(term => counter.incr(term.getName + "$" + term.getNatrue.natureStr))
      })
    } finally {
      // Closing again is harmless should the loader already have closed it.
      input.close()
    }

    val fileWriter = new FileWriter(outputFile)
    try {
      counter.foreach(pair => fileWriter.write("%s\t%s\n".format(pair._1, pair._2)))
    } finally {
      fileWriter.close()
    }
    println("output file is " + outputFile)
  }

  /**
   * Segments every line of the input file against the configured dictionary.
   *
   * The dictionary is loaded from the resource named by
   * `TripleConsts.CONF_DIC_FILE`; each line is matched with
   * `StringUtil.backwardMaxMatch` and the matched substrings are written to
   * the output joined by single spaces, one output line per input line.
   * A progress message is printed after each line.
   *
   * @param inputFile  path of the text file to segment
   * @param outputFile path of the segmented text to write (overwritten)
   */
  def segFile(inputFile: String, outputFile: String): Unit = {
    val dic = new util.HashMap[String, String]
    val dicStream = conf.getConfResourceAsInputStream(conf.get(TripleConsts.CONF_DIC_FILE))
    try {
      AdvFile.loadFileInDelimitLine(dicStream, (ln: String) => dic.put(ln, ln))
    } finally {
      // The lookup may yield null when the resource is missing; close only a real stream.
      Option(dicStream).foreach(_.close())
    }

    // Open the input before the output so an unreadable input does not
    // truncate an existing output file.
    val source = Source.fromFile(inputFile)
    try {
      val fileWriter = new FileWriter(outputFile)
      try {
        source.getLines().zipWithIndex.foreach { case (line, index) =>
          // NOTE(review): 100 and 1 are passed straight to backwardMaxMatch;
          // presumably match-length bounds - confirm against StringUtil.
          val wordsPos = StringUtil.backwardMaxMatch(line, dic, 100, 1)
          val segrst = wordsPos
            .map(position => line.substring(position.first, position.second))
            .mkString(" ")
          fileWriter.write(segrst + '\n')
          println("Finishing processing %s\r".format(index + 1))
        }
      } finally {
        fileWriter.close()
      }
    } finally {
      source.close()
    }
    println("Output file is " + outputFile)
  }
}