# /open-dm-dq/standardizer/other/scripts/inputPattern.awk
# AWK | 201 lines | 181 code | 20 blank | 0 comment | 0 complexity | e3caf270db409c691e4533ab903c7472 MD5 | raw file
# Possible License(s): BSD-3-Clause, LGPL-2.0
# Record a single CODE -> NAME entry in the given lookup table.
function defineType(table, code, name) {
    table[code] = name
}

# Set up the running record counter and the three lookup tables that the END
# rule uses to translate the short codes of the pattern file into the symbolic
# names written to the XML output.
BEGIN {
    idx = 0

    # ---- input token codes -> input token names ----
    defineType(inputType, "",   "UNSPECIFIED")
    defineType(inputType, "1P", "UNKNOWN_1P")
    defineType(inputType, "A1", "ALPHA_ONE")
    defineType(inputType, "A2", "ALPHA_TWO")
    defineType(inputType, "A3", "UNKNOWN_A3")
    defineType(inputType, "AM", "AMPERSAND")
    defineType(inputType, "AN", "ALPHA_NUM")
    defineType(inputType, "AU", "GENERIC_WORD")
    defineType(inputType, "B*", "BUILDING_CLASS")
    defineType(inputType, "BI", "UNKNOWN_BI")
    defineType(inputType, "BN", "UNKNOWN_BN")
    defineType(inputType, "BP", "BUILDING_PROPERTY")
    defineType(inputType, "BS", "UNKNOWN_BS")
    defineType(inputType, "BT", "UNKNOWN_BT")
    defineType(inputType, "BU", "BUILDING_UNIT")
    defineType(inputType, "BX", "POST_OFFICE_BOX")
    defineType(inputType, "CN", "UNKNOWN_CN")
    defineType(inputType, "D1", "DIGIT")
    defineType(inputType, "DA", "LEADING_DASH")
    defineType(inputType, "DB", "UNKNOWN_DB")
    defineType(inputType, "DM", "UNKNOWN_DM")
    defineType(inputType, "DR", "STREET_DIRECTION")
    defineType(inputType, "EI", "EXTRA_INFORMATION")
    defineType(inputType, "EN", "UNKNOWN_EN")
    defineType(inputType, "EX", "EXTENSION")
    defineType(inputType, "FC", "NUMERIC_FRACTION")
    defineType(inputType, "H*", "HOUSE_CLASS")
    defineType(inputType, "HN", "UNKNOWN_HN")
    defineType(inputType, "HR", "HIGHWAY_ROUTE")
    defineType(inputType, "HS", "UNKNOWN_HS")
    defineType(inputType, "MP", "MILE_POST")
    defineType(inputType, "N*", "MOSTLY_NUMERIC_CLASS")
    defineType(inputType, "NA", "UNKNOWN_NA")
    defineType(inputType, "NB", "UNKNOWN_NB")
    defineType(inputType, "NL", "COMMON_WORD")
    defineType(inputType, "NU", "NUMERIC_VALUE")
    defineType(inputType, "OT", "ORDINAL_TYPE")
    defineType(inputType, "P*", "POST_OFFICE_BOX_CLASS")
    defineType(inputType, "PD", "UNKNOWN_PD")
    defineType(inputType, "PT", "PREFIX_TYPE")
    defineType(inputType, "R*", "RURAL_ROUTE_CLASS")
    defineType(inputType, "RR", "RURAL_ROUTE")
    defineType(inputType, "SA", "STATE_ABBREVIATION")
    defineType(inputType, "SD", "UNKNOWN_SD")
    defineType(inputType, "ST", "UNKNOWN_ST")
    defineType(inputType, "T*", "STREET_TYPE_CLASS")
    defineType(inputType, "TB", "UNKNOWN_TB")
    defineType(inputType, "TY", "STREET_TYPE")
    defineType(inputType, "W*", "UNIT_WITHIN_STRUCTURE_CLASS")
    defineType(inputType, "WD", "STRUCTURE_DESCRIPTOR")
    defineType(inputType, "WI", "STRUCTURE_IDENTIFIER")
    defineType(inputType, "XN", "UNKNOWN_XN")

    # ---- output token codes -> output token names ----
    defineType(outputType, "1P", "BUILDING_NUMBER_PREFIX")
    defineType(outputType, "2P", "SECOND_BUILDING_NUMBER_PREFIX")
    defineType(outputType, "B*", "BUILDING_CLASS")
    defineType(outputType, "B1", "UNKNOWN_B1")
    defineType(outputType, "B2", "UNKNOWN_B2")
    defineType(outputType, "BD", "PROPERTY_SUFFIX_DIRECTION")
    defineType(outputType, "BE", "UNKNOWN_BE")
    defineType(outputType, "BI", "STRUCTURE_IDENTIFIER")
    defineType(outputType, "BN", "PROPERTY_BUILDING_NAME")
    defineType(outputType, "BS", "BUILDING_NUMBER_SUFFIX")
    defineType(outputType, "BT", "PROPERTY_TYPE_SUFFIX")
    defineType(outputType, "BX", "POST_OFFICE_BOX_DESCRIPTOR")
    defineType(outputType, "BY", "STRUCTURE_DESCRIPTOR")
    defineType(outputType, "DB", "PROPERTY_PREFIX_DIRECTION")
    defineType(outputType, "EI", "EXTRA_INFORMATION")
    defineType(outputType, "EX", "STREET_NAME_EXTENSION_INDEX")
    defineType(outputType, "H*", "HOUSE_CLASS")
    defineType(outputType, "H1", "FIRST_HOUSE_NUMBER")
    defineType(outputType, "H2", "SECOND_HOUSE_NUMBER")
    defineType(outputType, "HN", "HOUSE_NUMBER")
    defineType(outputType, "HS", "HOUSE_NUMBER_SUFFIX")
    defineType(outputType, "MN", "MATCHING_PROPERTY_NAME")
    defineType(outputType, "MS", "MATCHING_STREET_NAME")
    defineType(outputType, "N*", "MOSTLY_NUMERIC_CLASS")
    defineType(outputType, "N2", "SECOND_STREET_NAME")
    defineType(outputType, "NA", "STREET_NAME")
    defineType(outputType, "NB", "BUILDING_NUMBER")
    defineType(outputType, "NL", "CONJUNCTION")
    # NOTE(review): "ON" is defined twice in a row, so the second definition
    # replaces the first and ORIGINAL_PROPERTY_NAME is never produced. One of
    # the two codes is presumably a typo -- confirm against the pattern file.
    defineType(outputType, "ON", "ORIGINAL_PROPERTY_NAME")
    defineType(outputType, "ON", "ORIGINAL_SECOND_STREET_NAME")
    defineType(outputType, "OS", "ORIGINAL_STREET_NAME")
    defineType(outputType, "P*", "POST_OFFICE_BOX_CLASS")
    defineType(outputType, "P1", "HOUSE_NUMBER_PREFIX")
    defineType(outputType, "P2", "SECOND_HOUSE_NUMBER_PREFIX")
    defineType(outputType, "PD", "STREET_NAME_PREFIX_DIRECTION")
    defineType(outputType, "PT", "STREET_NAME_PREFIX_TYPE")
    defineType(outputType, "R*", "RURAL_ROUTE_CLASS")
    defineType(outputType, "RN", "RURAL_ROUTE_IDENTIFIER")
    defineType(outputType, "RR", "RURAL_ROUTE_DESCRIPTOR")
    defineType(outputType, "S2", "SECOND_STREET_NAME_SUFFIX_TYPE_S2")
    defineType(outputType, "SD", "STREET_NAME_SUFFIX_DIRECTION")
    defineType(outputType, "SN", "STORAGE_STREET_NAME")
    defineType(outputType, "ST", "STREET_NAME_SUFFIX_TYPE")
    defineType(outputType, "T*", "STREET_TYPE_CLASS")
    defineType(outputType, "T2", "SECOND_STREET_NAME_SUFFIX_TYPE_T2")
    defineType(outputType, "TB", "PROPERTY_TYPE_PREFIX")
    defineType(outputType, "W*", "UNIT_WITHIN_STRUCTURE_CLASS")
    defineType(outputType, "WD", "WITHIN_STRUCTURE_DESCRIPTOR")
    defineType(outputType, "WI", "WITHIN_STRUCTURE_IDENTIFIER")
    defineType(outputType, "XN", "POST_OFFICE_BOX_IDENTIFIER")

    # ---- one-letter pattern class codes -> pattern class names ----
    defineType(patternClass, "H", "HOUSE")
    defineType(patternClass, "B", "BUILDING")
    defineType(patternClass, "W", "UNIT_WITHIN_STRUCTURE")
    defineType(patternClass, "T", "STREET_TYPE")
    defineType(patternClass, "R", "RURAL_ROUTE")
    defineType(patternClass, "P", "POST_OFFICE_BOX")
    defineType(patternClass, "N", "MOSTLY_NUMERIC")
}
# Ignore empty lines and lines made up only of spaces.
/^ *$/ { next }

# Read one two-line record.
#   First line : columns 1-35 hold the input token list, column 38 onward the
#                example text.
#   Second line: the pattern definition line, fetched with getline. Column 58
#                is read later as the usage flag and column 59 marks a record
#                as excluded when it is not blank.
# Kept records are appended to lines[] (pattern line padded to 80 columns, a
# space, then the XML-escaped example), and records[] maps each input token
# list to the comma-separated indexes of its entries in lines[].
{
    inputTokenList = substr($0, 1, 35)
    gsub(/^ +/, "", inputTokenList)
    gsub(/ +$/, "", inputTokenList)

    example = substr($0, 38)
    gsub(/^ +/, "", example)
    gsub(/ +$/, "", example)
    # Escape XML metacharacters; "&" must go first so the entities produced
    # for "<" and ">" are not escaped a second time. "\\&" in a replacement
    # string is a literal ampersand.
    gsub(/&/, "\\&amp;", example)
    gsub(/</, "\\&lt;", example)
    gsub(/>/, "\\&gt;", example)

    # getline returns 0 at end of input and -1 on a read error; both mean the
    # second line of the record is missing. Note that the END rule still runs
    # after exit, but the non-zero status is preserved.
    if ((getline) <= 0) {
        print "Unexpected EOF: " FILENAME > "/dev/stderr"
        exit 1
    }

    # The END rule reads the example back from column 82, so the pattern line
    # is forced to exactly 80 columns (padded or cut) before the separator.
    line = sprintf("%-80.80s", $0) " " example
    excluded = substr(line, 59, 1)
    if (excluded == " ") {
        lines[++idx] = line
        if (inputTokenList in records) {
            records[inputTokenList] = records[inputTokenList] "," idx
        } else {
            records[inputTokenList] = idx
        }
    }
}
# Return the symbolic name stored for a token code. A code that is missing
# from the table is reported on stderr and yields an empty name, which is what
# a plain array reference would have produced on stdout.
function tokenName(table, code, kind) {
    if (code in table)
        return table[code]
    print "Unknown " kind " token: " code > "/dev/stderr"
    return ""
}

# Write every collected pattern as a <pattern> element inside <patterns>.
# For each distinct input token list one stored line is chosen: the first one
# whose usage flag (column 58) is blank, or the last stored line when none is.
# Columns read from the chosen line:
#   1-35  output token list      38  pattern class code
#   39    "*" selects AVERAGE    40-42 priority
#   82-   example text (already XML-escaped when it was stored)
END {
    print "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>"
    print "<patterns>"

    # The order of "for (key in array)" is unspecified, so first index the
    # token lists by the position of their first stored line and then walk the
    # positions in ascending order. This makes the output order reproducible.
    for (inputTokenList in records) {
        split(records[inputTokenList], indexes, ",")
        firstSeen[indexes[1]] = inputTokenList
    }

    for (n = 1; n <= idx; n++) {
        if (!(n in firstSeen))
            continue
        inputTokenList = firstSeen[n]

        cnt = split(records[inputTokenList], indexes, ",")
        for (i = 1; i <= cnt; i++) {
            line = lines[indexes[i]]

            usageFlag = substr(line, 58, 1)
            if (usageFlag == " ")
                break
        }

        outputTokenList = substr(line, 1, 35)
        gsub(/^ +/, "", outputTokenList)
        gsub(/ +$/, "", outputTokenList)

        class = substr(line, 38, 1)

        scoringOption = substr(line, 39, 1)
        if (scoringOption == "*")
            scoringOption = "AVERAGE"
        else
            scoringOption = "NO_AVERAGE"

        priority = substr(line, 40, 3)
        gsub(/^ +/, "", priority)
        gsub(/ +$/, "", priority)

        example = substr(line, 82)

        inputTokenCount = split(inputTokenList, inputTokens)
        outputTokenCount = split(outputTokenList, outputTokens)

        print " <pattern>"

        print " <inputTokens>"
        for (i = 1; i <= inputTokenCount; i++)
            print " <inputToken>" tokenName(inputType, inputTokens[i], "input") "</inputToken>"
        print " </inputTokens>"

        print " <outputTokens>"
        for (i = 1; i <= outputTokenCount; i++)
            print " <outputToken>" tokenName(outputType, outputTokens[i], "output") "</outputToken>"
        print " </outputTokens>"

        print " <example>" example "</example>"
        print " <priority>" priority "</priority>"
        print " <patternClass>" patternClass[class] "</patternClass>"
        print " <scoringOption>" scoringOption "</scoringOption>"

        print " </pattern>"
    }

    print "</patterns>"
}