/src/net/yacy/cider/document/Charset.java
https://github.com/yacy/cider · Java · 102 lines · 49 code · 14 blank · 39 comment · 27 complexity · 91597c58c1310cb5766400f804b773be MD5 · raw file
- /**
- * Charset.java
- * Copyright 2010 by Michael Peter Christen
- * First released 27.4.2010 at http://yacy.net
- *
- * This file is part of YaCy Content Integration
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program in the file COPYING.LESSER.
- * If not, see <http://www.gnu.org/licenses/>.
- */
- package net.yacy.cider.document;
/**
 * Helper for normalising character-set names found in the wild (for example in
 * HTML meta tags or HTTP headers) to names the Java runtime recognises.
 */
public class Charset {

    /** GB2312 spelled with or without separators and the optional "-80" suffix. */
    private static final java.util.regex.Pattern GB2312_NAME =
            java.util.regex.Pattern.compile("GB[_-]?2312([-_]80)?");

    /** Any name that contains "UTF8", "UTF-8" or "UTF_8" somewhere. */
    private static final java.util.regex.Pattern UTF8_NAME =
            java.util.regex.Pattern.compile(".*UTF[-_]?8.*");

    /** Windows code pages written as "CP1250" .. "CP1258", with an optional separator. */
    private static final java.util.regex.Pattern CODE_PAGE_125X =
            java.util.regex.Pattern.compile("CP([_-])?125[0-8]");

    /**
     * Patches wrong or sloppy encoding names.
     *
     * <p>Some HTML authors use wrong encoding names, either because they do not know exactly
     * what they are doing or because they produce a typo. Many times the upper/lower-case
     * scheme of the name is fuzzy as well. The correct names are taken from
     * http://www.iana.org/assignments/character-sets
     *
     * <p>The following corrections are applied:
     * <ul>
     *   <li>surrounding whitespace is removed and the name is upper-cased independently of
     *       the default locale (except for "windows-..." names, which stay lower-case);</li>
     *   <li>names starting with SHIFT, BIG, MACINTOSH, US or KOI and names matching the
     *       GB2312 or UTF-8 patterns are mapped to one fixed canonical name;</li>
     *   <li>underscores are replaced by dashes;</li>
     *   <li>missing dashes in "windowsNNNN", "ISONNNN" and "ISO-8859N" are inserted;</li>
     *   <li>the popular typo "ISO-8559" is corrected to "ISO-8859";</li>
     *   <li>"CP125x" code page names are converted to "windows-125x".</li>
     * </ul>
     *
     * @param encoding the encoding name as found in the document; may be {@code null}
     * @return the patched encoding name; the name of the JVM default charset if
     *         {@code encoding} is {@code null} or has fewer than three characters after
     *         trimming
     */
    public static String patchCharsetEncoding(String encoding) {

        // no name at all: fall back to the system default encoding
        if (encoding == null) {
            return java.nio.charset.Charset.defaultCharset().name();
        }

        // trim BEFORE checking the length, so blank or padded too-short input
        // also falls back to the default instead of yielding an empty name
        encoding = encoding.trim();
        if (encoding.length() < 3) {
            return java.nio.charset.Charset.defaultCharset().name();
        }

        // fix upper/lowercase; Locale.ROOT keeps 'i' -> 'I' even under e.g. a Turkish default locale
        encoding = encoding.toUpperCase(java.util.Locale.ROOT);
        if (encoding.startsWith("SHIFT")) {
            return "Shift_JIS";
        }
        if (encoding.startsWith("BIG")) {
            return "Big5";
        }
        if (encoding.startsWith("MACINTOSH")) {
            return "MacRoman";
        }
        // all other names but such with "windows" use uppercase
        if (encoding.startsWith("WINDOWS")) {
            encoding = "windows" + encoding.substring(7);
        }

        // fix wrong fill characters
        encoding = encoding.replace('_', '-');
        if (GB2312_NAME.matcher(encoding).matches()) {
            return "GB2312";
        }
        if (UTF8_NAME.matcher(encoding).matches()) {
            return "UTF-8";
        }
        if (encoding.startsWith("US")) {
            return "US-ASCII";
        }
        // NOTE(review): every KOI* name, including KOI8-U, is mapped to KOI8-R - confirm this is intended
        if (encoding.startsWith("KOI")) {
            return "KOI8-R";
        }

        // patch missing '-' in "windows1252"
        if (encoding.startsWith("windows") && encoding.length() > 7 && isAsciiDigit(encoding.charAt(7))) {
            encoding = "windows-" + encoding.substring(7);
        }

        if (encoding.startsWith("ISO")) {
            // patch missing '-' directly after the prefix: "ISO8859-1" -> "ISO-8859-1"
            if (encoding.length() > 3 && isAsciiDigit(encoding.charAt(3))) {
                encoding = "ISO-" + encoding.substring(3);
            }
            // popular typo: "ISO-8559" instead of "ISO-8859"
            if (encoding.startsWith("ISO-8559")) {
                encoding = "ISO-8859" + encoding.substring(8);
            }
            // patch missing '-' before the part number: "ISO-88591" -> "ISO-8859-1".
            // Restricted to the 8859 family so that valid names with a digit at this
            // position (e.g. "ISO-10646-UCS-2") are not damaged.
            if (encoding.startsWith("ISO-8859") && encoding.length() > 8 && isAsciiDigit(encoding.charAt(8))) {
                encoding = "ISO-8859-" + encoding.substring(8);
            }
        }

        // converting cp125x -> windows-125x; the pattern guarantees the last four characters are the digits
        if (CODE_PAGE_125X.matcher(encoding).matches()) {
            encoding = "windows-" + encoding.substring(encoding.length() - 4);
        }
        return encoding;
    }

    /** Tells whether {@code c} is one of the ASCII digits '0'..'9'. */
    private static boolean isAsciiDigit(final char c) {
        return (c >= '0') && (c <= '9');
    }
}