PageRenderTime 48ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/i18npool/source/isolang/lcid.awk

https://bitbucket.org/markjenkins/libreoffice_ubuntu-debian-fixes
AWK | 187 lines | 97 code | 11 blank | 79 comment | 0 complexity | ce13296d941a0036db05c92e6d239cd2 MD5 | raw file
Possible License(s): GPL-3.0, LGPL-3.0, MPL-2.0-no-copyleft-exception, LGPL-2.1, BSD-3-Clause-No-Nuclear-License-2014
  1. #!/usr/bin/awk -f
  2. #
  3. # This file is part of the LibreOffice project.
  4. #
  5. # This Source Code Form is subject to the terms of the Mozilla Public
  6. # License, v. 2.0. If a copy of the MPL was not distributed with this
  7. # file, You can obtain one at http://mozilla.org/MPL/2.0/.
  8. #
  9. # This file incorporates work covered by the following license notice:
  10. #
  11. # Licensed to the Apache Software Foundation (ASF) under one or more
  12. # contributor license agreements. See the NOTICE file distributed
  13. # with this work for additional information regarding copyright
  14. # ownership. The ASF licenses this file to you under the Apache
  15. # License, Version 2.0 (the "License"); you may not use this file
  16. # except in compliance with the License. You may obtain a copy of
  17. # the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18. #
  19. # Utility to compare MS-LANGID definitions with those defined in ../../inc/i18npool/lang.h
  20. # Run in i18npool/source/isolang
  21. #
  22. # outputs new #define LANGUAGE_... 0x... and also some commented out substrings
  23. # that were matched in already existing defines.
  24. #
  25. # ATTENTION! The sed filter in the command line examples below assures that a
  26. # '|' border is drawn by html2text in data tables, and nowhere else, on which
  27. # this awk script relies. This script also heavily relies on the column layout
  28. # encountered. Should MS decide to change their layout or their CSS names
  29. # ("data..."), this would probably break. Should html2text decide that the last
  30. # border="..." attribute encountered wins instead of the first, this may break
  31. # also.
  32. #
  33. # sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g'
  34. #
  35. # After html2text it is best to clean up the file so that it contains _only_ the
  36. # table entries, but this is not required: other lines are filtered out. Check the output.
  37. #
  38. # Expects input from the saved page of one of
  39. #
  40. # (1)
  41. # http://www.microsoft.com/globaldev/reference/lcid-all.mspx
  42. # filtered through ``html2text -nobs ...'', generated table:
  43. # blank,name,hex,dec,blank fields:
  44. # |Afrikaans_-_South_Africa___|0436___|1078___|
  45. #
  46. # complete command line:
  47. # lynx -dump -source http://www.microsoft.com/globaldev/reference/lcid-all.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
  48. #
  49. #
  50. # (2)
  51. # http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx
  52. # filtered through ``html2text -nobs ...'', generated table:
  53. # blank,name,hex,dec,inputlocales,collection,blank fields:
  54. # |Afrikaans |0436 |1078 |0436:00000409, |Basic |
  55. #
  56. # complete command line:
  57. # lynx -dump -source http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
  58. #
  59. #
  60. # (3)
  61. # http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp
  62. # filtered through ``html2text -nobs ...'', generated table:
  63. # blank,hex,locale,name,blank fields:
  64. # |0x0436___|af-ZA___|Afrikaans_(South_Africa)___|
  65. #
  66. # complete command line:
  67. # lynx -dump -source http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
  68. #
  69. # Author: Eike Rathke <erack@sun.com>, <er@openoffice.org>
  70. #
  # Setup: load the LANGUAGE_* definitions already present in lang.h into
  # lang[HEX] = NAME, then configure field splitting and the per-layout column
  # tables for the html2text table that follows on standard input.
  BEGIN {
      langh = "../../inc/i18npool/lang.h"
      # Plain getline sets $0 and NF, so $2/$3 are the macro name and its value
      # as split by the default whitespace field separator.
      while ((status = (getline < langh)) > 0)
      {
          # Require real whitespace between the tokens (spaces or tabs) so that
          # $2 and $3 are guaranteed to be the name and the 0x... value.
          if ($0 ~ /^#define[ \t]+LANGUAGE_[_A-Za-z0-9]*[ \t]+0x[0-9a-fA-F]/)
          {
              # lang[HEX]=NAME, HEX without the leading "0x", both upper-cased
              lang[toupper(substr($3,3))] = toupper($2)
          }
      }
      # getline returns -1 when the file cannot be opened or read. Keep going
      # (the comparison is still usable) but say why everything will look new.
      if (status < 0)
          print "Cannot read " langh "; every input entry will be reported as new." >>"/dev/stderr"
      close(langh)

      # html2text table follows; a single-character FS is taken literally,
      # so no escaping of '|' is needed.
      FS = "|"

      # Input layout identifiers; 0 means "not yet recognized".
      filetype = 0
      lcid_all = 1
      xp_lcid = 2
      nls_238z = 3
      filetypename[filetype] = "unknown"
      filetypename[lcid_all] = "lcid_all"
      filetypename[xp_lcid] = "xp_lcid"
      filetypename[nls_238z] = "nls_238z"

      # Field number of the language name for each layout.
      namefield[lcid_all] = 2
      namefield[xp_lcid] = 2
      namefield[nls_238z] = 4
      # Field number of the hexadecimal LCID for each layout.
      hexfield[lcid_all] = 3
      hexfield[xp_lcid] = 3
      hexfield[nls_238z] = 2
      # Field number of the locale string; 0 means the layout has none.
      locfield[lcid_all] = 0
      locfield[xp_lcid] = 0
      locfield[nls_238z] = 3
  }
  # Lines with fewer than five '|'-separated fields cannot be table rows of any
  # of the supported layouts (the leading and trailing empty fields count too).
  NF < 5 { next }

  # Until a layout has been recognized, try to detect it from the current row:
  # five fields starting with "0x"       -> nls_238z
  # five fields starting with "Afrikaans" -> lcid_all
  # seven fields                          -> xp_lcid
  # Rows seen before a layout is recognized are skipped.
  !filetype {
      if (NF == 5)
      {
          if ($2 ~ /^0x/)
              filetype = nls_238z
          else if ($2 ~ /^Afrikaans/)
              filetype = lcid_all
      }
      else if (NF == 7)
          filetype = xp_lcid
      if (!filetype)
          next
      # Column numbers of name, hex LCID and locale for the detected layout.
      name = namefield[filetype]
      hex = hexfield[filetype]
      loc = locfield[filetype]
  }

  # Normalize one table row and record it in all[], comment[] and newlang[];
  # rows whose LCID column is not hexadecimal go to filtered[] instead.
  {
      # Drop everything up to and including the first ':' and everything from
      # the first '.' onwards in the name column.
      gsub( /^[^:]*:/, "", $name)
      gsub( /\..*/, "", $name)
      # Strip the blank/underscore padding html2text puts around cell contents.
      gsub( /(^[ _]+)|([ _]+$)/, "", $hex)
      gsub( /(^[ _]+)|([ _]+$)/, "", $name)
      if (loc)
          gsub( /(^[ _]+)|([ _]+$)/, "", $loc)

      if ($hex ~ /^0x/)
          $hex = substr( $hex, 3)

      # Validate BEFORE padding: otherwise an empty cell would be padded to "0"
      # and wrongly accepted as the LCID 0x0.
      if ($hex !~ /^[0-9a-fA-F]+$/)
      {
          filtered[$hex] = $0
          next
      }

      # If only 464 (or 64) instead of 0464, pad to four digits to match lang.h.
      while (length($hex) < 4)
          $hex = "0" $hex

      key = toupper($hex)
      # all[HEX]=string
      all[key] = $name
      if (loc)
          comment[key] = " /* " $loc " */"
      # new hex: newlang[HEX]=string
      if (!(key in lang))
          newlang[key] = $name
  }
  # Report phase. Prints, in order:
  #  - the detected input layout,
  #  - for every LCID missing from lang.h: a placeholder line with the raw name,
  #    one comment line per existing definition that contains a word of that
  #    name, and a suggested #define built from the upper-cased words,
  #  - the lang.h entries that did not occur in the input,
  #  - the table rows that were rejected because of a non-hex LCID column.
  END {
      # Nothing useful was collected if the layout was never recognized.
      if (!filetype)
      {
          print "No file type recognized." >>"/dev/stderr"
          exit(1)
      }
      print "// assuming " filetypename[filetype] " file"

      # every new language
      for (code in newlang)
      {
          title = newlang[code]
          printf( "xxxxxxx LANGUAGE_%-26s 0x%s%s\n", title, code, comment[code])

          # Break the name into identifier words at every non-alphanumeric
          # character; empty pieces between adjacent separators are ignored.
          ident = ""
          nwords = split(title, words, /[^A-Za-z0-9]/)
          for (w = 1; w <= nwords; w++)
          {
              if (!length(words[w]))
                  continue
              upper = toupper(words[w])
              # Words are joined with '_' to form the macro suffix.
              ident = (ident == "" ? upper : ident "_" upper)
              # Contained in already existing definitions? The word is purely
              # alphanumeric, so a substring search is all that is needed.
              for (known in lang)
              {
                  if (index(lang[known], upper))
                      printf( "// %-50s %s\n", words[w] ": " lang[known], known)
              }
          }
          printf( "#define LANGUAGE_%-26s 0x%s\n", ident, code)
      }

      print "\n// --- reverse check follows ----------------------------------\n"
      for (code in lang)
      {
          if (code in all)
              continue
          print "// not in input file: " code " " lang[code]
      }

      print "\n// --- filtered table entries follow (if any) -----------------\n"
      for (code in filtered)
          print "// filtered: " code " " filtered[code]
  }