/dkpro-core-norvig-asl/src/main/java/org/dkpro/core/norvig/NorvigSpellingCorrector.java

https://github.com/dkpro/dkpro-core · Java · 106 lines · 69 code · 10 blank · 27 comment · 4 complexity · bd491220bcf747a9b72bf2492deb4ca2 MD5 · raw file

  1. /*
  2. * Copyright 2017
  3. * Ubiquitous Knowledge Processing (UKP) Lab
  4. * Technische Universität Darmstadt
  5. *
  6. * Licensed under the Apache License, Version 2.0 (the "License");
  7. * you may not use this file except in compliance with the License.
  8. * You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. package org.dkpro.core.norvig;
  19. import static org.apache.uima.fit.util.JCasUtil.select;
  20. import static org.apache.uima.fit.util.JCasUtil.selectCovered;
  21. import org.apache.uima.UimaContext;
  22. import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
  23. import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
  24. import org.apache.uima.fit.descriptor.ConfigurationParameter;
  25. import org.apache.uima.fit.descriptor.ResourceMetaData;
  26. import org.apache.uima.fit.descriptor.TypeCapability;
  27. import org.apache.uima.jcas.JCas;
  28. import org.apache.uima.resource.ResourceInitializationException;
  29. import org.dkpro.core.api.parameter.ComponentParameters;
  30. import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SpellingAnomaly;
  31. import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
  32. import de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation;
  33. import eu.openminted.share.annotations.api.Component;
  34. import eu.openminted.share.annotations.api.Parameters;
  35. import eu.openminted.share.annotations.api.constants.OperationType;
  36. /**
  37. * Identifies spelling errors using Norvig's algorithm.
  38. */
  39. @Component(OperationType.SPELLING_CHECKER)
  40. @ResourceMetaData(name = "Simple Spelling Corrector")
  41. @Parameters(
  42. exclude = {
  43. NorvigSpellingCorrector.PARAM_MODEL_LOCATION })
  44. @TypeCapability(
  45. inputs = {
  46. "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"},
  47. outputs = {
  48. "de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation"})
  49. public class NorvigSpellingCorrector
  50. extends JCasAnnotator_ImplBase
  51. {
  52. /**
  53. * Location from which the model is read. This is either a local path or a classpath location.
  54. * In the latter case, the model artifact (if any) is searched as well.
  55. */
  56. public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
  57. @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false)
  58. private String modelLocation;
  59. private NorvigSpellingAlgorithm spellingCorrector;
  60. @Override
  61. public void initialize(UimaContext context)
  62. throws ResourceInitializationException
  63. {
  64. super.initialize(context);
  65. try {
  66. spellingCorrector = new NorvigSpellingAlgorithm();
  67. spellingCorrector.train(getContext().getResourceURL(modelLocation), "UTF-8");
  68. }
  69. catch (Exception e) {
  70. throw new ResourceInitializationException(e);
  71. }
  72. }
  73. @Override
  74. public void process(JCas jcas)
  75. throws AnalysisEngineProcessException
  76. {
  77. for (Token t : select(jcas, Token.class)) {
  78. String token = t.getCoveredText();
  79. // If there is no spelling error in this token, then we do not
  80. // have to correct it.
  81. if (selectCovered(SpellingAnomaly.class, t).size() == 0) {
  82. continue; // No mistake here
  83. }
  84. String correction = spellingCorrector.correct(token);
  85. if (!correction.equals(token)) {
  86. // Create change annotation
  87. SofaChangeAnnotation change = new SofaChangeAnnotation(jcas, t.getBegin(),
  88. t.getEnd());
  89. change.setValue(correction);
  90. change.setReason("spelling error");
  91. change.setOperation("replace");
  92. change.addToIndexes();
  93. }
  94. }
  95. }
  96. }