PageRenderTime 75ms CodeModel.GetById 32ms RepoModel.GetById 0ms app.codeStats 0ms

/open-dm-dq/standardizer/other/scripts/generateClueTableFR.awk

https://bitbucket.org/pymma/mosaic
AWK | 164 lines | 147 code | 17 blank | 0 comment | 0 complexity | 6dbf16982e3cc4458ae221b73c1aaed3 MD5 | raw file
Possible License(s): BSD-3-Clause, LGPL-2.0
  # Start-up rule: fill the inputType lookup table (clue type code -> type
  # name, as later written inside <type> elements) and emit the XML prologue
  # plus the opening <clues> root tag.
  BEGIN {
      # Whitespace-separated "CODE=NAME" pairs. The very first pair has an
      # empty code on purpose: it registers the name for the "" key.
      typeTable = "=UNSPECIFIED 1P=UNKNOWN_1P"
      typeTable = typeTable " A1=ALPHA_ONE A2=ALPHA_TWO A3=UNKNOWN_A3 AM=AMPERSAND AN=ALPHA_NUM AU=GENERIC_WORD"
      typeTable = typeTable " B*=UNKNOWN_B_STAR B+=UNKNOWN_B_PLUS BI=UNKNOWN_BI BN=UNKNOWN_BN BP=BUILDING_PROPERTY"
      typeTable = typeTable " BS=UNKNOWN_BS BT=UNKNOWN_BT BU=BUILDING_UNIT BX=POST_OFFICE_BOX"
      typeTable = typeTable " CN=UNKNOWN_CN"
      typeTable = typeTable " D1=DIGIT DA=LEADING_DASH DB=UNKNOWN_DB DM=UNKNOWN_DM DR=STREET_DIRECTION"
      typeTable = typeTable " EI=EXTRA_INFORMATION EN=UNKNOWN_EN EX=EXTENSION"
      typeTable = typeTable " FC=NUMERIC_FRACTION"
      typeTable = typeTable " H*=UNKNOWN_H_STAR H+=UNKNOWN_H_PLUS HN=UNKNOWN_HN HR=HIGHWAY_ROUTE HS=UNKNOWN_HS"
      typeTable = typeTable " MP=MILE_POST"
      typeTable = typeTable " N*=UNKNOWN_N_STAR N+=UNKNOWN_N_PLUS NA=UNKNOWN_NA NB=UNKNOWN_NB NL=COMMON_WORD NU=NUMERIC_VALUE"
      typeTable = typeTable " OT=ORDINAL_TYPE"
      typeTable = typeTable " P*=UNKNOWN_P_STAR P+=UNKNOWN_P_PLUS PD=UNKNOWN_PD PT=PREFIX_TYPE"
      typeTable = typeTable " R*=UNKNOWN_R_STAR R+=UNKNOWN_R_PLUS RR=RURAL_ROUTE"
      typeTable = typeTable " SA=STATE_ABBREVIATION SD=UNKNOWN_SD ST=UNKNOWN_ST"
      typeTable = typeTable " T*=UNKNOWN_T_STAR T+=UNKNOWN_T_PLUS TB=UNKNOWN_TB TY=STREET_TYPE"
      typeTable = typeTable " W*=UNKNOWN_W_STAR W+=UNKNOWN_W_PLUS WD=STRUCTURE_DESCRIPTOR WI=STRUCTURE_IDENTIFIER"
      typeTable = typeTable " XN=UNKNOWN_XN"

      # Split on whitespace, then cut each pair at its first "=".
      pairCount = split(typeTable, pairs, " ")
      for (idx = 1; idx <= pairCount; idx++) {
          eqPos = index(pairs[idx], "=")
          inputType[substr(pairs[idx], 1, eqPos - 1)] = substr(pairs[idx], eqPos + 1)
      }

      # XML declaration followed by the root element's opening tag.
      print "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>"
      print "<clues>"
  }
  # Return text with leading and trailing spaces removed (spaces only).
  function trim(text) {
      sub(/^ +/, "", text)
      sub(/ +$/, "", text)
      return text
  }

  # Return text with the XML markup characters '&', '<' and '>' replaced by
  # their entities. '&' is handled first so the entities themselves are not
  # re-escaped. "\\&" in the replacement yields a literal ampersand; a bare
  # "&" would mean "the matched text".
  function xmlEscape(text) {
      gsub(/&/, "\\&amp;", text)
      gsub(/</, "\\&lt;", text)
      gsub(/>/, "\\&gt;", text)
      return text
  }

  # Extract the fixed-width column range [start, start + width) from line,
  # trimmed and XML-escaped, ready to be placed inside an element.
  function clueField(line, start, width) {
      return xmlEscape(trim(substr(line, start, width)))
  }

  # Convert one fixed-width clue record into its <clue> XML fragment.
  #
  # Column layout (1-based):
  #    1-34  name
  #   35-47  translation
  #   51-80  five (word id: 4 columns, type code: 2 columns) slots
  #   98     "*" marks the translation as expanded
  #
  # Returns the fragment as newline-separated lines without a trailing
  # newline, so that `print` terminates it. Reads the global inputType table.
  # Names after the first parameter are locals.
  function clueXml(record,    line, xml, expanded, slot, offset, wordId, typeCode, typeName) {
      # Pad short records so that every substr() below sees real columns.
      line = sprintf("%-99s", record)

      expanded = (substr(line, 98, 1) == "*") ? "true" : "false"

      xml = " <clue>\n"
      xml = xml " <name>" clueField(line, 1, 34) "</name>\n"
      xml = xml " <translation>" clueField(line, 35, 13) "</translation>\n"
      xml = xml " <translationExpanded>" expanded "</translationExpanded>\n"
      xml = xml " <words>\n"

      # Slots are 6 columns wide and start at column 51; empty ids are skipped.
      for (slot = 0; slot < 5; slot++) {
          offset = 51 + 6 * slot
          wordId = clueField(line, offset, 4)
          if (wordId == "")
              continue

          # The code is only a lookup key (never printed), so it is trimmed
          # but not escaped. Trimming lets a blank code reach the "" entry.
          # `in` avoids inserting unknown codes into the table as a side
          # effect; unknown codes still yield an empty <type>.
          typeCode = trim(substr(line, offset + 4, 2))
          typeName = (typeCode in inputType) ? inputType[typeCode] : ""

          xml = xml " <word>\n"
          xml = xml " <id>" wordId "</id>\n"
          xml = xml " <type>" typeName "</type>\n"
          xml = xml " </word>\n"
      }

      xml = xml " </words>\n"
      return xml " </clue>"
  }

  # Every input record becomes one <clue> element on standard output.
  {
      print clueXml($0)
  }
  # Once all input has been consumed, close the <clues> root element.
  END {
      rootClose = "</clues>"
      print rootClose
  }