PageRenderTime 46ms CodeModel.GetById 10ms RepoModel.GetById 0ms app.codeStats 1ms

/open-dm-dq/standardizer/other/scripts/inputPattern.awk

https://bitbucket.org/pymma/mosaic
AWK | 201 lines | 181 code | 20 blank | 0 comment | 0 complexity | e3caf270db409c691e4533ab903c7472 MD5 | raw file
Possible License(s): BSD-3-Clause, LGPL-2.0
  # Fill table[] from a space-separated list of CODE=NAME pairs.
  # Pairs are applied left to right, so when a CODE is repeated the last
  # NAME given for it is the one that remains in the table.
  # The names after the wide gap in the parameter list are local variables.
  function defineCodes(table, spec,    pairs, total, p, eq) {
      total = split(spec, pairs, " ")
      for (p = 1; p <= total; p++) {
          eq = index(pairs[p], "=")
          table[substr(pairs[p], 1, eq - 1)] = substr(pairs[p], eq + 1)
      }
  }

  # Set up the record counter and the three lookup tables used when the XML
  # is written: two-character input token codes, two-character output token
  # codes, and one-character pattern class codes.
  BEGIN {
      idx = 0

      # Input token code -> symbolic name.  The empty code cannot be written
      # as a CODE=NAME pair, so it is assigned directly.
      inputType[""] = "UNSPECIFIED"
      defineCodes(inputType, "1P=UNKNOWN_1P A1=ALPHA_ONE A2=ALPHA_TWO A3=UNKNOWN_A3")
      defineCodes(inputType, "AM=AMPERSAND AN=ALPHA_NUM AU=GENERIC_WORD")
      defineCodes(inputType, "B*=BUILDING_CLASS BI=UNKNOWN_BI BN=UNKNOWN_BN BP=BUILDING_PROPERTY")
      defineCodes(inputType, "BS=UNKNOWN_BS BT=UNKNOWN_BT BU=BUILDING_UNIT BX=POST_OFFICE_BOX")
      defineCodes(inputType, "CN=UNKNOWN_CN D1=DIGIT DA=LEADING_DASH DB=UNKNOWN_DB DM=UNKNOWN_DM")
      defineCodes(inputType, "DR=STREET_DIRECTION EI=EXTRA_INFORMATION EN=UNKNOWN_EN EX=EXTENSION")
      defineCodes(inputType, "FC=NUMERIC_FRACTION H*=HOUSE_CLASS HN=UNKNOWN_HN HR=HIGHWAY_ROUTE")
      defineCodes(inputType, "HS=UNKNOWN_HS MP=MILE_POST N*=MOSTLY_NUMERIC_CLASS NA=UNKNOWN_NA")
      defineCodes(inputType, "NB=UNKNOWN_NB NL=COMMON_WORD NU=NUMERIC_VALUE OT=ORDINAL_TYPE")
      defineCodes(inputType, "P*=POST_OFFICE_BOX_CLASS PD=UNKNOWN_PD PT=PREFIX_TYPE")
      defineCodes(inputType, "R*=RURAL_ROUTE_CLASS RR=RURAL_ROUTE")
      defineCodes(inputType, "SA=STATE_ABBREVIATION SD=UNKNOWN_SD ST=UNKNOWN_ST")
      defineCodes(inputType, "T*=STREET_TYPE_CLASS TB=UNKNOWN_TB TY=STREET_TYPE")
      defineCodes(inputType, "W*=UNIT_WITHIN_STRUCTURE_CLASS WD=STRUCTURE_DESCRIPTOR")
      defineCodes(inputType, "WI=STRUCTURE_IDENTIFIER XN=UNKNOWN_XN")

      # Output token code -> symbolic name.
      defineCodes(outputType, "1P=BUILDING_NUMBER_PREFIX 2P=SECOND_BUILDING_NUMBER_PREFIX")
      defineCodes(outputType, "B*=BUILDING_CLASS B1=UNKNOWN_B1 B2=UNKNOWN_B2")
      defineCodes(outputType, "BD=PROPERTY_SUFFIX_DIRECTION BE=UNKNOWN_BE BI=STRUCTURE_IDENTIFIER")
      defineCodes(outputType, "BN=PROPERTY_BUILDING_NAME BS=BUILDING_NUMBER_SUFFIX")
      defineCodes(outputType, "BT=PROPERTY_TYPE_SUFFIX BX=POST_OFFICE_BOX_DESCRIPTOR")
      defineCodes(outputType, "BY=STRUCTURE_DESCRIPTOR DB=PROPERTY_PREFIX_DIRECTION")
      defineCodes(outputType, "EI=EXTRA_INFORMATION EX=STREET_NAME_EXTENSION_INDEX")
      defineCodes(outputType, "H*=HOUSE_CLASS H1=FIRST_HOUSE_NUMBER H2=SECOND_HOUSE_NUMBER")
      defineCodes(outputType, "HN=HOUSE_NUMBER HS=HOUSE_NUMBER_SUFFIX")
      defineCodes(outputType, "MN=MATCHING_PROPERTY_NAME MS=MATCHING_STREET_NAME")
      defineCodes(outputType, "N*=MOSTLY_NUMERIC_CLASS N2=SECOND_STREET_NAME NA=STREET_NAME")
      defineCodes(outputType, "NB=BUILDING_NUMBER NL=CONJUNCTION")
      # NOTE(review): the code "ON" is defined twice in a row.  The second
      # definition replaces the first, so "ON" ends up meaning
      # ORIGINAL_SECOND_STREET_NAME and ORIGINAL_PROPERTY_NAME is never used.
      # One of the two was presumably meant to have a different code - confirm
      # against the pattern file format before changing it.
      defineCodes(outputType, "ON=ORIGINAL_PROPERTY_NAME")
      defineCodes(outputType, "ON=ORIGINAL_SECOND_STREET_NAME")
      defineCodes(outputType, "OS=ORIGINAL_STREET_NAME")
      defineCodes(outputType, "P*=POST_OFFICE_BOX_CLASS P1=HOUSE_NUMBER_PREFIX")
      defineCodes(outputType, "P2=SECOND_HOUSE_NUMBER_PREFIX PD=STREET_NAME_PREFIX_DIRECTION")
      defineCodes(outputType, "PT=STREET_NAME_PREFIX_TYPE")
      defineCodes(outputType, "R*=RURAL_ROUTE_CLASS RN=RURAL_ROUTE_IDENTIFIER RR=RURAL_ROUTE_DESCRIPTOR")
      defineCodes(outputType, "S2=SECOND_STREET_NAME_SUFFIX_TYPE_S2 SD=STREET_NAME_SUFFIX_DIRECTION")
      defineCodes(outputType, "SN=STORAGE_STREET_NAME ST=STREET_NAME_SUFFIX_TYPE")
      defineCodes(outputType, "T*=STREET_TYPE_CLASS T2=SECOND_STREET_NAME_SUFFIX_TYPE_T2")
      defineCodes(outputType, "TB=PROPERTY_TYPE_PREFIX")
      defineCodes(outputType, "W*=UNIT_WITHIN_STRUCTURE_CLASS WD=WITHIN_STRUCTURE_DESCRIPTOR")
      defineCodes(outputType, "WI=WITHIN_STRUCTURE_IDENTIFIER XN=POST_OFFICE_BOX_IDENTIFIER")

      # Pattern class letter -> symbolic name.
      defineCodes(patternClass, "H=HOUSE B=BUILDING W=UNIT_WITHIN_STRUCTURE T=STREET_TYPE")
      defineCodes(patternClass, "R=RURAL_ROUTE P=POST_OFFICE_BOX N=MOSTLY_NUMERIC")
  }
  # Return text with the XML metacharacters &, < and > replaced by entities.
  # In a gsub() replacement a bare "&" stands for the matched text, so the
  # literal ampersand that starts each entity is written as "\\&".
  function escapeXml(text) {
      # Ampersands go first so the entities added below are not escaped again.
      gsub(/&/, "\\&amp;", text)
      gsub(/</, "\\&lt;", text)
      gsub(/>/, "\\&gt;", text)
      return text
  }

  # Return text without its leading and trailing spaces.
  function trimSpaces(text) {
      gsub(/^ +/, "", text)
      gsub(/ +$/, "", text)
      return text
  }

  # Ignore lines that are empty or contain only spaces.
  /^ *$/ { next }

  # Each pattern occupies two consecutive lines.  This rule fires on the first
  # line (input token list in columns 1-35, example text from column 38) and
  # reads the second line itself with getline.  The second line is stored in
  # lines[] together with the escaped example, and records[] maps each input
  # token list to a comma-separated list of the lines[] indexes that share it.
  {
      inputTokenList = trimSpaces(substr($0, 1, 35))
      example = escapeXml(trimSpaces(substr($0, 38)))

      # getline returns 0 at end of input and -1 on a read error; neither
      # leaves a second line to pair with the first.
      if ((getline) <= 0) {
          print "Unexpected EOF: " FILENAME > "/dev/stderr"
          exit
      }

      # The END block reads the example back from column 82, so the second
      # line is padded to 80 columns before " " and the example are appended.
      # Lines that are already 80 or more characters long are left unchanged.
      line = sprintf("%-80s", $0) " " example

      # Column 59 is read as the "excluded" flag; only records with a blank
      # there are kept.
      excluded = substr(line, 59, 1)
      if (excluded == " ") {
          lines[++idx] = line
          if (inputTokenList in records) {
              records[inputTokenList] = records[inputTokenList] "," idx
          } else {
              records[inputTokenList] = idx
          }
      }
  }
  # Write the collected patterns to standard output as an XML document, one
  # <pattern> element per distinct input token list.  The for-in loop visits
  # the token lists in whatever order the awk implementation provides.
  END {
      print "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>"
      print "<patterns>"
      for (tokenKey in records) {
          total = split(records[tokenKey], lineIds, ",")

          # Use the first stored line whose column 58 (the usage flag) is
          # blank; when no line qualifies, the last stored line is used.
          chosen = lines[lineIds[total]]
          for (k = 1; k < total; k++) {
              if (substr(lines[lineIds[k]], 58, 1) == " ") {
                  chosen = lines[lineIds[k]]
                  break
              }
          }

          # Fixed-column fields of the chosen line: output tokens in 1-35,
          # class letter in 38, scoring marker in 39, priority in 40-42 and
          # the example text from column 82 onwards.
          outputTokenText = substr(chosen, 1, 35)
          sub(/^ +/, "", outputTokenText)
          sub(/ +$/, "", outputTokenText)
          classCode = substr(chosen, 38, 1)
          scoring = (substr(chosen, 39, 1) == "*") ? "AVERAGE" : "NO_AVERAGE"
          rank = substr(chosen, 40, 3)
          sub(/^ +/, "", rank)
          sub(/ +$/, "", rank)
          sample = substr(chosen, 82)

          inCount = split(tokenKey, inCodes)
          outCount = split(outputTokenText, outCodes)

          print " <pattern>"
          print " <inputTokens>"
          for (k = 1; k <= inCount; k++)
              print " <inputToken>" inputType[inCodes[k]] "</inputToken>"
          print " </inputTokens>"
          print " <outputTokens>"
          for (k = 1; k <= outCount; k++)
              print " <outputToken>" outputType[outCodes[k]] "</outputToken>"
          print " </outputTokens>"
          print " <example>" sample "</example>"
          print " <priority>" rank "</priority>"
          print " <patternClass>" patternClass[classCode] "</patternClass>"
          print " <scoringOption>" scoring "</scoringOption>"
          print " </pattern>"
      }
      print "</patterns>"
  }