PageRenderTime 74ms CodeModel.GetById 46ms RepoModel.GetById 0ms app.codeStats 0ms

/src/net/yacy/cider/document/Charset.java

https://github.com/yacy/cider
Java | 102 lines | 49 code | 14 blank | 39 comment | 27 complexity | 91597c58c1310cb5766400f804b773be MD5 | raw file
  1. /**
  2. * Charset.java
  3. * Copyright 2010 by Michael Peter Christen
  4. * First released 27.4.2010 at http://yacy.net
  5. *
  6. * This file is part of YaCy Content Integration
  7. *
  8. * This program is free software: you can redistribute it and/or modify
  9. * it under the terms of the GNU Lesser General Public License as published by
  10. * the Free Software Foundation, either version 3 of the License, or
  11. * (at your option) any later version.
  12. *
  13. * This program is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. * GNU Lesser General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public License
  19. * along with this program in the file COPYING.LESSER.
  20. * If not, see <http://www.gnu.org/licenses/>.
  21. */
  22. package net.yacy.cider.document;
  23. public class Charset {
  24. /**
  25. * some html authors use wrong encoding names, either because they don't know exactly what they
  26. * are doing or they produce a type. Many times, the upper/downcase scheme of the name is fuzzy
  27. * This method patches wrong encoding names. The correct names are taken from
  28. * http://www.iana.org/assignments/character-sets
  29. * @param encoding
  30. * @return patched encoding name
  31. */
  32. public static String patchCharsetEncoding(String encoding) {
  33. // return the system default encoding
  34. if ((encoding == null) || (encoding.length() < 3)) return java.nio.charset.Charset.defaultCharset().name();
  35. // trim encoding string
  36. encoding = encoding.trim();
  37. // fix upper/lowercase
  38. encoding = encoding.toUpperCase();
  39. if (encoding.startsWith("SHIFT")) return "Shift_JIS";
  40. if (encoding.startsWith("BIG")) return "Big5";
  41. // all other names but such with "windows" use uppercase
  42. if (encoding.startsWith("WINDOWS")) encoding = "windows" + encoding.substring(7);
  43. if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman";
  44. // fix wrong fill characters
  45. encoding = encoding.replaceAll("_", "-");
  46. if (encoding.matches("GB[_-]?2312([-_]80)?")) return "GB2312";
  47. if (encoding.matches(".*UTF[-_]?8.*")) return "UTF-8";
  48. if (encoding.startsWith("US")) return "US-ASCII";
  49. if (encoding.startsWith("KOI")) return "KOI8-R";
  50. // patch missing '-'
  51. if (encoding.startsWith("windows") && encoding.length() > 7) {
  52. final char c = encoding.charAt(7);
  53. if ((c >= '0') && (c <= '9')) {
  54. encoding = "windows-" + encoding.substring(7);
  55. }
  56. }
  57. if (encoding.startsWith("ISO")) {
  58. // patch typos
  59. if (encoding.length() > 3) {
  60. final char c = encoding.charAt(3);
  61. if ((c >= '0') && (c <= '9')) {
  62. encoding = "ISO-" + encoding.substring(3);
  63. }
  64. }
  65. if (encoding.length() > 8) {
  66. final char c = encoding.charAt(8);
  67. if ((c >= '0') && (c <= '9')) {
  68. encoding = encoding.substring(0, 8) + "-" + encoding.substring(8);
  69. }
  70. }
  71. }
  72. // patch wrong name
  73. if (encoding.startsWith("ISO-8559")) {
  74. // popular typo
  75. encoding = "ISO-8859" + encoding.substring(8);
  76. }
  77. // converting cp\d{4} -> windows-\d{4}
  78. if (encoding.matches("CP([_-])?125[0-8]")) {
  79. final char c = encoding.charAt(2);
  80. if ((c >= '0') && (c <= '9')) {
  81. encoding = "windows-" + encoding.substring(2);
  82. } else {
  83. encoding = "windows" + encoding.substring(2);
  84. }
  85. }
  86. return encoding;
  87. }
  88. }