HTMLHeadHandler.java - HTML META tag information used to de…

/eclipse-wtp-webservices-R3.4.0/org.eclipse.wst.ws.parser/src/org/eclipse/wst/ws/internal/parser/wsil/HTMLHeadHandler.java

# · Java · 296 lines · 212 code · 24 blank · 60 comment · 35 complexity · 81e85af995faaa15f2f80463bd935da4 MD5 · raw file

/*******************************************************************************
 * Copyright (c) 2001, 2006 IBM Corporation and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 * IBM Corporation - initial API and implementation
 * yyyymmdd bug      Email and other contact information
 * -------- -------- -----------------------------------------------------------
 * 20060517   142324 rsinha@ca.ibm.com - Rupam Kuehner
 *******************************************************************************/

package org.eclipse.wst.ws.internal.parser.wsil;

import java.io.UnsupportedEncodingException;
import java.util.Vector;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * SAX handler that collects WSIL and DISCO document references declared in the
 * HEAD of an HTML page.
 * <p>
 * Typical use is two-step: {@link #harvestHeadTags(byte[])} reduces an HTML
 * document to a <code>&lt;root&gt;</code> element containing only its META and
 * LINK tags, and that reduced document is then fed to a SAX parser that uses
 * this handler. The references found are available through
 * {@link #getWsils()} and {@link #getDiscos()}.
 */
public class HTMLHeadHandler extends DefaultHandler
{
  private static final char START_TAG = '<';
  private static final char END_TAG = '>';
  private static final String HEAD_START_TAG = "<head>";
  private static final String HEAD_END_TAG = "</head>";
  private static final String ROOT_START_TAG = "<root>";
  private static final String ROOT_END_TAG = "</root>";
  private static final String UTF8 = "UTF-8";
  
  //HTML META tag information used to detect the charset.
  private static final String HTML_CONTENT = "content";
  private static final String HTTP_EQUIV = "http-equiv";
  private static final String HTTP_EQUIV_CONTENT_TYPE = "Content-Type";
  private static final String CHARSET = "charset";
  
  // WSIL tag information.
  private static final String META = "meta";
  private static final String NAME = "name";
  private static final String SERVICE_INSPECTION = "serviceInspection";
  private static final String CONTENT = "content";
  
  // DISCO tag information.
  private static final String LINK = "link";
  private static final String TYPE = "type";
  private static final String TEXT_XML = "text/xml";
  private static final String REL = "rel";
  private static final String ALTERNATE = "alternate";
  private static final String HREF = "href";
    
  /** URI of the HTML page; relative references are resolved against it. May be null. */
  private String baseURI_;
  /** WSIL document URIs found so far, without duplicates, in discovery order. */
  private Vector wsils_;
  /** DISCO document URIs found so far, without duplicates, in discovery order. */
  private Vector discos_;
  /** Charset used to decode/encode the document in harvestHeadTags. */
  private String byteEncoding = UTF8; //Default to UTF-8.
  
  /**
   * @param baseURI URI of the HTML page being inspected. References that do
   * not contain ":/" are appended to everything up to and including the last
   * '/' of this URI. May be null, in which case references are kept as found.
   */
  public HTMLHeadHandler(String baseURI)
  {
    super();
    baseURI_ = baseURI;
    wsils_ = new Vector();
    discos_ = new Vector();
  }
 
  /**
   * @return the WSIL URIs collected so far (a fresh array on every call).
   */
  public String[] getWsils()
  {
    String[] wsils = new String[wsils_.size()];
    wsils_.copyInto(wsils);
    return wsils;
  }
  
  /**
   * @return the DISCO URIs collected so far (a fresh array on every call).
   */
  public String[] getDiscos()
  {
    String[] discos = new String[discos_.size()];
    discos_.copyInto(discos);
    return discos;
  }
  
  /**
   * Records the target of
   * <ul>
   * <li>a META element whose name attribute is "serviceInspection" (WSIL), and</li>
   * <li>a LINK element with type "text/xml" and rel "alternate" (DISCO).</li>
   * </ul>
   * Element names are compared case-insensitively and independently of the
   * default locale. Elements that lack the attribute carrying the reference
   * are ignored.
   */
  public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException
  {
    if (qName == null || attributes == null)
      return;
    if (META.equalsIgnoreCase(qName))
    {
      if (SERVICE_INSPECTION.equals(attributes.getValue(NAME)))
        addUnique(wsils_, resolveAgainstBase(attributes.getValue(CONTENT)));
    }
    else if (LINK.equalsIgnoreCase(qName))
    {
      // See http://msdn.microsoft.com/msdnmag/issues/02/02/xml/default.aspx for more details on DISCO.
      String type = attributes.getValue(TYPE);
      String rel = attributes.getValue(REL);
      if (TEXT_XML.equals(type) && ALTERNATE.equals(rel))
        addUnique(discos_, resolveAgainstBase(attributes.getValue(HREF)));
    }
  }
  
  /**
   * Turns a reference into the URI that gets recorded.
   * @param reference attribute value, possibly null.
   * @return null for a null reference; the reference itself if it contains
   * ":/" or no base URI is known; otherwise the base URI up to and including
   * its last '/' followed by the reference.
   */
  private String resolveAgainstBase(String reference)
  {
    if (reference == null)
      return null;
    if (baseURI_ == null || reference.indexOf(":/") != -1)
      return reference;
    return baseURI_.substring(0, baseURI_.lastIndexOf("/")+1) + reference;
  }
  
  /**
   * Adds the URI to the list unless it is null or already present.
   */
  private static void addUnique(Vector uris, String uri)
  {
    if (uri != null && !uris.contains(uri))
      uris.add(uri);
  }
  
  /**
   * Deliberately ignored: parse problems must not abort the harvesting.
   */
  public void error(SAXParseException e) throws SAXException
  {
  }
  
  /**
   * Deliberately ignored. Note that this suppresses the exception that
   * DefaultHandler would otherwise throw for a fatal error.
   */
  public void fatalError(SAXParseException e) throws SAXException
  {
  }
  
  /**
   * Deliberately ignored: parse problems must not abort the harvesting.
   */
  public void warning(SAXParseException e) throws SAXException
  {
  }
  
  /**
   * Appends the elements of the provided tag in the provided document to the provided StringBuffer.
   * The tag name is matched case-insensitively and must be a complete name
   * (followed by whitespace, '/' or '&gt;'), so "metadata" is not taken for "meta".
   * @param target buffer that receives every matching start tag, verbatim.
   * @param document text to scan.
   * @param tag lower-case tag name to look for.
   * @param encoding charset the document was decoded with.
   * @return boolean false if the value of the encoding parameter matched the detected charset or if no
   * usable charset was detected. Returns true if a supported charset was detected and it did not equal
   * the encoding parameter (ignoring case). If true is returned the byteEncoding field has been set to
   * the detected charset and the harvesting of the tags has stopped at the point the charset was
   * detected. The caller should call this method again with the correct encoding.
   */
  private boolean harvestTags(StringBuffer target,String document,String tag, String encoding)
  {
    int documentLength = document.length();
    int tagLength = tag.length();
    int index = document.indexOf(START_TAG);
    // The bound guarantees that the character following the tag name exists.
    while (index != -1 && (index+1+tagLength)<documentLength)
    {
      int resumeAt = index+1;
      if (document.regionMatches(true, index+1, tag, 0, tagLength)
          && isTagNameEnd(document.charAt(index+1+tagLength)))
      {
        int closeIndex = document.indexOf(END_TAG, index+1);
        if (closeIndex == -1)
          break; //Unterminated tag: there is nothing complete left to harvest.
        String element = document.substring(index, closeIndex+1);
        target.append(element);
        resumeAt = closeIndex+1;
        
        //If tag is META and declares the charset, find out what it is
        //and if it matches what was passed in. If it matches, continue 
        //with the parsing and return false when complete. 
        //If the detected charset is different from what was passed in, 
        //- change byteEncoding to equal the detected charset.
        //- stop parsing.
        //- return true.
        if (tag.equals(META))
        {
          String detected = detectCharset(element);
          //An unsupported or malformed charset name is ignored so that
          //byteEncoding always names a charset this JVM can actually use.
          if (detected != null && !detected.equalsIgnoreCase(encoding) && isSupportedEncoding(detected))
          {
            byteEncoding = detected;
            return true;
          }
        }
      }
      index = document.indexOf(START_TAG, resumeAt);
    }
    return false;
  }
  
  /**
   * @return true if the character can legally follow a complete tag name.
   */
  private static boolean isTagNameEnd(char c)
  {
    return c == END_TAG || c == '/' || Character.isWhitespace(c);
  }
  
  /**
   * Extracts the charset declared by a
   * <code>&lt;meta http-equiv="Content-Type" content="...; charset=X"&gt;</code> tag.
   * Attribute names, the http-equiv value and the "charset" keyword are
   * matched case-insensitively.
   * @param metaTag text of one META start tag.
   * @return the trimmed charset name, or null if the tag does not declare one.
   */
  private static String detectCharset(String metaTag)
  {
    String httpEquiv = attributeValue(metaTag, HTTP_EQUIV);
    if (httpEquiv == null || !HTTP_EQUIV_CONTENT_TYPE.equalsIgnoreCase(httpEquiv.trim()))
      return null;
    String content = attributeValue(metaTag, HTML_CONTENT);
    if (content == null)
      return null;
    int idxOfCharSet = indexOfIgnoreCase(content, CHARSET, 0);
    if (idxOfCharSet == -1)
      return null; //e.g. content="text/html": a media type without charset.
    int idxOfEquals = content.indexOf('=', idxOfCharSet+CHARSET.length());
    if (idxOfEquals == -1)
      return null;
    String charset = content.substring(idxOfEquals+1);
    //Drop any parameter that follows the charset.
    int idxOfSeparator = charset.indexOf(';');
    if (idxOfSeparator != -1)
      charset = charset.substring(0, idxOfSeparator);
    charset = charset.trim();
    return charset.length() == 0 ? null : charset;
  }
  
  /**
   * Returns the quoted value of an attribute in the text of a start tag.
   * An occurrence of the name only counts as the attribute if it is preceded
   * by whitespace and followed by '=' (whitespace around '=' is allowed);
   * this keeps "content" from matching inside the value "Content-Type".
   * @param element text of one start tag.
   * @param attribute attribute name, matched case-insensitively.
   * @return the text between the quotes (single or double), or null if the
   * attribute is absent, unquoted or its closing quote is missing.
   */
  private static String attributeValue(String element, String attribute)
  {
    int elementLength = element.length();
    int searchFrom = 0;
    int at;
    while ((at = indexOfIgnoreCase(element, attribute, searchFrom)) != -1)
    {
      searchFrom = at+1;
      if (at == 0 || !Character.isWhitespace(element.charAt(at-1)))
        continue;
      int cursor = skipWhitespace(element, at+attribute.length());
      if (cursor >= elementLength || element.charAt(cursor) != '=')
        continue;
      cursor = skipWhitespace(element, cursor+1);
      if (cursor >= elementLength)
        return null;
      char quote = element.charAt(cursor);
      if (quote != '"' && quote != '\'')
        continue;
      int closingQuote = element.indexOf(quote, cursor+1);
      if (closingQuote == -1)
        return null;
      return element.substring(cursor+1, closingQuote);
    }
    return null;
  }
  
  /**
   * @return index of the first non-whitespace character at or after from,
   * or the length of the text if there is none.
   */
  private static int skipWhitespace(String text, int from)
  {
    int i = from;
    while (i < text.length() && Character.isWhitespace(text.charAt(i)))
      i++;
    return i;
  }
  
  /**
   * Case-insensitive, locale-independent variant of String.indexOf. Working
   * on the original text (instead of a lower-cased copy) guarantees that the
   * returned offset is valid for the original text.
   * @return index of the first match at or after from, or -1.
   */
  private static int indexOfIgnoreCase(String text, String token, int from)
  {
    int tokenLength = token.length();
    int lastStart = text.length() - tokenLength;
    for (int i = Math.max(from, 0); i <= lastStart; i++)
    {
      if (text.regionMatches(true, i, token, 0, tokenLength))
        return i;
    }
    return -1;
  }
  
  /**
   * @return true if this JVM can decode/encode the named charset.
   */
  private static boolean isSupportedEncoding(String name)
  {
    try
    {
      "".getBytes(name);
      return true;
    }
    catch (UnsupportedEncodingException uee)
    {
      return false;
    }
  }
  
  /**
   * Decodes the bytes with byteEncoding, falling back to the platform
   * default charset should byteEncoding be unsupported.
   */
  private String decode(byte[] b)
  {
    try
    {
      return new String(b, byteEncoding);
    }
    catch (UnsupportedEncodingException uee)
    {
      return new String(b);
    }
  }
  
  /**
   * @return the text from the first opening HEAD tag up to and including the
   * first closing HEAD tag that follows it, or null if there is no such pair.
   */
  private static String extractHead(String html)
  {
    int headStartIndex = indexOfIgnoreCase(html, HEAD_START_TAG, 0);
    if (headStartIndex == -1)
      return null;
    int headEndIndex = indexOfIgnoreCase(html, HEAD_END_TAG, headStartIndex);
    if (headEndIndex == -1)
      return null;
    return html.substring(headStartIndex, headEndIndex+HEAD_END_TAG.length());
  }

  /**
   * If the provided byte array represents the contents of an HTML
   * document, this method will return a byte array in which
   * <ul>
   * <li>the opening and closing HEAD tags are removed and replaced with 
   * opening and closing &lt;root&gt; tags</li>
   * <li>only the META and LINK elements that are in the HEAD of the HTML
   * document are included in the contents between the opening and closing
   * &lt;root&gt; tags (all META elements first, then all LINK elements).</li>
   * </ul>
   * If no HEAD element is found the result is an empty &lt;root&gt; element.
   * <p>
   * This method will modify the value of the byteEncoding String
   * attribute on this class if the document declares a supported charset
   * other than the current one (UTF-8 initially). Callers of this method
   * should call getByteEncoding() after calling this method if they need to
   * know the charset value used by this method to decode/encode the byte array.
   * @param b raw bytes of the HTML document.
   * @return byte[] the reduced document, encoded with getByteEncoding().
   */
  public byte[] harvestHeadTags(byte[] b)
  {
    StringBuffer sb = new StringBuffer(ROOT_START_TAG);
    //Assume the current byte encoding (UTF-8 by default) for now.
    String head = extractHead(decode(b));
    if (head != null)
    {
      boolean encodingChanged = harvestTags(sb,head,META, byteEncoding);
      if (encodingChanged)
      {
        //The harvestTags method detected a different charset
        //than the one that was passed in. Start from the beginning
        //with the correct charset.
        sb = new StringBuffer(ROOT_START_TAG);
        head = extractHead(decode(b));
        if (head != null)
        {
          harvestTags(sb,head,META, byteEncoding);
          harvestTags(sb,head,LINK,byteEncoding);
        }
      }
      else
      {
        harvestTags(sb,head,LINK,byteEncoding);
      }
    }
    sb.append(ROOT_END_TAG);
    try
    {
      return sb.toString().getBytes(byteEncoding);
    }
    catch (UnsupportedEncodingException uee)
    {
      return sb.toString().getBytes();
    }
  }
  
  /**
   * @return the charset name last used by harvestHeadTags; "UTF-8" until a
   * document declares a different supported charset.
   */
  public String getByteEncoding()
  {
    return byteEncoding;
  }
}
Tech Fingerprint

Alerts (5)

Complexity hotspot; line 91 (total complexity: 4)
91
Complexity hotspot; line 108 (total complexity: 4)
108
'Set' Raw collection type used. Specify generic type arguments (e.g., List<String>, Map<Integer, Client>) for type safety and clarity. Avoid raw types unless interacting with legacy code.
187
Complexity hotspot; line 246 (total complexity: 4)
246
Complexity hotspot; line 269 (total complexity: 4)
269