CocoLexer.java | searchcode

/ojc-core/encodersl/encoder-coco/src/com/sun/encoder/coco/model/CocoLexer.java

https://bitbucket.org/pymma/openesb-components
Java | 629 lines | 349 code | 51 blank | 229 comment | 80 complexity | ec950c0d1b26424343e1113f483c0a85 MD5 | raw file

/*
 * BEGIN_HEADER - DO NOT EDIT
 * 
 * The contents of this file are subject to the terms
 * of the Common Development and Distribution License
 * (the "License").  You may not use this file except
 * in compliance with the License.
 *
 * You can obtain a copy of the license at
 * https://open-jbi-components.dev.java.net/public/CDDLv1.0.html.
 * See the License for the specific language governing
 * permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL
 * HEADER in each file and include the License file at
 * https://open-jbi-components.dev.java.net/public/CDDLv1.0.html.
 * If applicable add the following below this CDDL HEADER,
 * with the fields enclosed by brackets "[]" replaced with
 * your own identifying information: Portions Copyright
 * [year] [name of copyright owner]
 */

/*
 * @(#)CocoLexer.java 
 *
 * Copyright 2004-2007 Sun Microsystems, Inc. All Rights Reserved.
 * 
 * END_HEADER - DO NOT EDIT
 */

package com.sun.encoder.coco.model;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;

/**
 * Tokenizer for Cobol Copybook input.
 *
 * @author Noel Ang
 *
 */
public final class CocoLexer {

    /** Row (line) of the current scan position; both constructors start it at 1. */
    private int mRecordRow;
    /** Column of the current scan position; both constructors start it at 1. */
    private int mRecordCol;
    /**
     * Tokens handed back through ungetToken(). getNextToken() removes from the
     * end of this list, so the most recently re-inserted token is returned first.
     */
    private ArrayList mUndoTokens;
    /** Character source; a PushbackReader so that input can be peeked and unread. */
    private final PushbackReader mInputReader;
    /**
     * When true, movePosition() no longer wraps the column counter at
     * CocoParser.SOURCE_LINE_LENGTH.
     */
    private boolean mIsDisable72ColumnLimit;

    /**
     * Describes the lexer state for diagnostics: current row and column, whether
     * the 72-column limit is in force, and any pending re-inserted tokens.
     *
     * @return one-line textual summary of this lexer
     */
    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder();
        text.append("row=").append(mRecordRow)
            .append(" col=").append(mRecordCol)
            .append(mIsDisable72ColumnLimit ? " Col>72OK" : " Col<=72");
        // The pending-token list is only shown when it holds something.
        if (mUndoTokens != null && !mUndoTokens.isEmpty()) {
            text.append(" undoTokens=").append(mUndoTokens);
        }
        return text.toString();
    }

    /**
     * Create a Cobol Copybook tokenizer for an input source.
     *
     * @param file The input source
     *
     * @throws FileNotFoundException if file does not point to an existing,
     *                               readable file
     */
    public CocoLexer(File file) throws FileNotFoundException {
        mInputReader = new PushbackReader(
                new InputStreamReader(new FileInputStream(file)),
                10);
        mRecordRow = 1;
        mRecordCol = 1;
        mUndoTokens = new ArrayList();
    }

    /**
     * Create a Cobol Copybook tokenizer for an input source.
     *
     * @param file     The input source
     * @param encoding Encoding of the input source
     *
     * @throws FileNotFoundException        if file does not point to an
     *                                      existing, readable file
     * @throws UnsupportedEncodingException if the specified encoding is not
     *                                      supported
     */
    public CocoLexer(File file, String encoding)
            throws FileNotFoundException, UnsupportedEncodingException {
        mInputReader = new PushbackReader(
                new InputStreamReader(new FileInputStream(file), encoding),
                10);
        mRecordRow = 1;
        mRecordCol = 1;
        mUndoTokens = new ArrayList();
    }

    /**
     * Tells the lexer it will not be used again so that it can let go of its
     * resources: pending re-inserted tokens are dropped and the input reader is
     * closed. Using the lexer after this call gives undefined behaviour.
     */
    public void dispose() {
        mUndoTokens.clear();
        try {
            mInputReader.close();
        } catch (IOException ignored) {
            // Best effort: a failure while closing is deliberately not reported.
        }
    }

    /**
     * Switches the IBM Cobol 72-column copybook constraint off or back on.
     * With the constraint off, copybook "Area B" content is allowed to run
     * beyond column 72.
     *
     * @param b <code>true</code> turns the 72-column constraint off,
     *          <code>false</code> turns it (back) on
     */
    public void setDisable72ColumnLimit(boolean b) {
        this.mIsDisable72ColumnLimit = b;
    }

    /**
     * Reports whether the lexer currently enforces the IBM Cobol constraint of
     * 72-column copybook content. This is the negation of the flag set through
     * {@link #setDisable72ColumnLimit}.
     *
     * @return <code>true</code> while the constraint is enforced,
     *         <code>false</code> once it has been disabled
     *
     * @see #setDisable72ColumnLimit
     */
    public boolean is72ColumnLimitEnforced() {
        final boolean enforced = !mIsDisable72ColumnLimit;
        return enforced;
    }

    /**
     * Get the next token from the input.
     *
     * <p>Tokens handed back through {@link #ungetToken} are returned first, the
     * most recently re-inserted one first. Otherwise the next character of the
     * input decides which kind of lexeme is scanned. When the input is
     * exhausted an EOF token (text <code>"EOF"</code>) is returned.</p>
     *
     * <p>A line feed, a carriage return + line feed pair, or a lone carriage
     * return is reported as a single separator token flagged as end-of-line;
     * the row counter is advanced and the column counter reset to 1.</p>
     *
     * @return Next token; may be null when the scanning helper chosen for the
     *         next character could not form a token from the input
     * @throws java.io.IOException if an I/O error event occurs; note that this is
     *         distinguishable from an EOF/EOD event! I/O errors are propagated
     *         to the caller rather than being turned into a null token.
     */
    public CocoToken getNextToken() throws IOException {
        // Re-inserted tokens take precedence, last in first out.
        if (!mUndoTokens.isEmpty()) {
            return (CocoToken) mUndoTokens.remove(mUndoTokens.size() - 1);
        }

        /*
         * The Cobol language has characters that serve dual purposes. For
         * example, the letter G at the head of a sequence could yield a Cobol
         * word or a separator (DBCS literal opening delimiter: G"). Therefore
         * the classification below cannot assume that every scanned character
         * belongs to exactly one category.
         */
        final int next = peek();
        if (next == -1) {
            // End of input is reported as a dedicated token with the text "EOF".
            return new CocoToken("EOF",
                    CocoTokenTypes.EOF_TOKEN,
                    mRecordRow,
                    mRecordCol);
        }

        final char ch = (char) next;

        if (ch == '\n' || ch == '\r') {
            /* Case: newline (LF, CR LF, or a lone CR) */
            return newEndOfLineToken(ch);
        }
        if (Character.isDigit(ch)) {
            /* Case: digit */
            return getNumeric();
        }
        if (Character.isLetter(ch)) {
            /* Case: alpha ... but some delimiters begin with alphas ... */
            return isPrettyDamnAnnoyingDelimiterNext()
                    ? getSeparator()
                    : getAlphaNumeric();
        }
        if (!CocoLanguage.isInCobolCharSet(ch)) {
            /* Case: character in system's set but not Cobol's set */
            read(); // consume the character; it is reported as its own token
            CocoToken token = new CocoToken(String.valueOf(ch),
                    CocoTokenTypes.NONCOBOL_TOKEN,
                    mRecordRow,
                    mRecordCol);
            movePosition(1);
            return token;
        }

        /* Case: special character or delimiter */
        return isSeparatorNext() ? getSeparator() : getCobolCharacter();
    }

    /**
     * Consume one line terminator and build the end-of-line separator token
     * for it.
     *
     * @param first the line-terminator character that is next in the input
     *              ('\n' or '\r'); it has been peeked but not yet consumed
     * @return separator token flagged as end-of-line, positioned where the
     *         terminator started
     * @throws java.io.IOException if an I/O error occurs
     */
    private CocoToken newEndOfLineToken(char first) throws IOException {
        CocoToken token = new CocoToken(CocoLanguage.SPACE,
                CocoTokenTypes.SEPARATOR_TOKEN,
                mRecordRow,
                mRecordCol);
        token.setIsEOL(true);

        read(); // consume the terminator's first character
        if (first == '\r' && peek() == '\n') {
            read(); // CR LF counts as one terminator
        }

        mRecordRow++;
        mRecordCol = 1;
        return token;
    }

    /**
     * Re-inserts a token at the front of the token stream, so that the next
     * call to {@link #getNextToken} returns it. No check is made that the token
     * is the one most recently emitted by getNextToken; any non-null token is
     * accepted.
     *
     * @param  token Token to place in "front" of the token stream
     * @throws java.lang.IllegalArgumentException if token is null
     */
    public void ungetToken(CocoToken token)
            throws IllegalArgumentException {
        if (token != null) {
            mUndoTokens.add(token);
            return;
        }
        throw new IllegalArgumentException();
    }

    /**
     * Collects the run of letters and digits at the current input position
     * into an alphanumeric token.
     *
     * @return alphanumeric token positioned at the row/column where the run
     *         started, or null if the next character is not a letter or digit
     * @throws java.io.IOException if an I/O error occurs
     */
    private CocoToken getAlphaNumeric() throws IOException {
        final StringBuilder lexeme = new StringBuilder();
        for (; isAlphaOrDigitNext(); ) {
            lexeme.append((char) read());
        }

        if (lexeme.length() == 0) {
            return null;
        }

        // The token records the position where it began; the counters then
        // advance past it.
        final CocoToken result = new CocoToken(lexeme.toString(),
                CocoTokenTypes.ALNUM_TOKEN,
                mRecordRow,
                mRecordCol);
        movePosition(result.getLength());
        return result;
    }

    /**
     * Collects the run of letters and digits at the current input position.
     * The result is a numeric token unless at least one letter was part of the
     * run, in which case it is an alphanumeric token instead.
     *
     * @return numeric or alphanumeric token positioned at the row/column where
     *         the run started, or null if the next character is not a letter
     *         or digit
     * @throws java.io.IOException if an I/O error occurs
     */
    private CocoToken getNumeric() throws IOException {
        final StringBuilder lexeme = new StringBuilder();

        // Assume a pure number until a letter shows up.
        CocoTokenTypes kind = CocoTokenTypes.NUM_TOKEN;

        for (; isAlphaOrDigitNext(); ) {
            final char current = (char) read();
            lexeme.append(current);
            if (Character.isLetter(current)) {
                kind = CocoTokenTypes.ALNUM_TOKEN;
            }
        }

        if (lexeme.length() == 0) {
            return null;
        }

        final CocoToken result = new CocoToken(lexeme.toString(),
                kind, mRecordRow, mRecordCol);
        movePosition(result.getLength());
        return result;
    }

    /**
     * Turns the next input character into a special-character token, provided
     * it belongs to the Cobol character set. The character is only consumed
     * when a token is produced.
     *
     * @return special-character token, or null if the next character is not
     *         in the Cobol character set
     * @throws java.io.IOException if an I/O error occurs
     */
    private CocoToken getCobolCharacter() throws IOException {
        final char candidate = (char) peek();
        if (!CocoLanguage.isInCobolCharSet(candidate)) {
            return null;
        }

        final CocoToken result = new CocoToken(String.valueOf(candidate),
                CocoTokenTypes.SPECIALCHAR_TOKEN,
                mRecordRow,
                mRecordCol);
        movePosition(1);
        read(); // consume the character that was only peeked so far
        return result;
    }


    /**
     * Scan for a Cobol separator at the current input position.
     *
     * <p>One character is read and classified:</p>
     * <ul>
     * <li>a space immediately followed by <code>==</code> yields a single
     *     <code>==</code> token, and all three characters are consumed;</li>
     * <li>G, N, X or Z (in either case) followed by a character with which it
     *     forms a two-character separator according to
     *     <code>CocoLanguage.isSeparator</code> yields that two-character
     *     token, upper-cased (for example <code>X"</code>);</li>
     * <li>any other character accepted by the one-character
     *     <code>CocoLanguage.isSeparator</code> yields a one-character
     *     token;</li>
     * <li>otherwise the character is pushed back and nothing is consumed.</li>
     * </ul>
     *
     * <p>The token carries the row/column at which the scan started.</p>
     *
     * @return Cobol separator token, or null if no (or no valid) input left to
     *         form one
     * @throws java.io.IOException if an I/O error occurs
     */
    private CocoToken getSeparator() throws IOException {
        if (!isCharsAvailable(1)) {
            return null;
        }

        final int chi = read();
        final char ch = (char) chi;
        CocoToken token = null;

        if (ch == ' ') {
            // b== (space followed by two equal signs) is a separator
            char[] peeks = new char[2];
            if (peek(peeks) == 2 && peeks[0] == '=' && peeks[1] == '=') {
                token = new CocoToken("==",
                        CocoTokenTypes.SEPARATOR_TOKEN,
                        mRecordRow,
                        mRecordCol);
                movePosition(3); // the space plus both equal signs
                read();
                read();
            }
        } else if ("GNXZ".indexOf(Character.toUpperCase(ch)) != -1
                && isCharsAvailable(1)) {
            // Z", X", N", and G" are separators; the letter may be in either
            // case and the token text is normalised to upper case.
            char ch1 = Character.toUpperCase(ch);
            char ch2 = Character.toUpperCase((char) peek());
            if (CocoLanguage.isSeparator(ch1, ch2)) {
                token = new CocoToken(new String(new char[] {ch1, ch2}),
                        CocoTokenTypes.SEPARATOR_TOKEN,
                        mRecordRow,
                        mRecordCol);
                movePosition(2);
                read(); // consume the second character of the pair
            }
        }

        // Covers every other case: a plain one-character separator, or no
        // separator at all.
        if (token == null) {
            if (CocoLanguage.isSeparator(ch)) {
                token = new CocoToken(String.valueOf(ch),
                        CocoTokenTypes.SEPARATOR_TOKEN,
                        mRecordRow,
                        mRecordCol);
                movePosition(1);
            } else {
                // Not a separator: leave the input as it was found.
                unread(chi);
            }
        }

        return token;
    }

    /**
     * Tests, without consuming anything, whether the next input character is a
     * letter or a digit.
     *
     * @return true if a next character exists and is a letter or digit,
     *         else false
     * @throws java.io.IOException if an I/O error occurs
     */
    private boolean isAlphaOrDigitNext() throws IOException {
        final int upcoming = peek();
        return upcoming != -1 && Character.isLetterOrDigit((char) upcoming);
    }

    /**
     * Tests, without consuming anything, whether the next input character
     * belongs to the Cobol character set.
     *
     * @return true if a next character exists and is in the set, else false
     * @throws java.io.IOException if an I/O error occurs
     */
    private boolean isCobolCharNext() throws IOException {
        final int upcoming = peek();
        if (upcoming == -1) {
            return false;
        }
        return CocoLanguage.isInCobolCharSet((char) upcoming);
    }

    /**
     * Tests, without consuming anything, whether the next input character is a
     * one-character Cobol separator.
     *
     * @return true if a next character exists and is a separator, else false
     * @throws java.io.IOException if an I/O error occurs
     */
    private boolean isSeparatorNext() throws IOException {
        final int upcoming = peek();
        if (upcoming == -1) {
            return false;
        }
        return CocoLanguage.isSeparator((char) upcoming);
    }

    /**
     * Determine if the next few input characters comprise one of the
     * multi-character delimiters: X", Z", N", G", or the sequence ==
     * preceded by a space. Nothing is consumed.
     *
     * <p>The two-character forms need at least two characters of input; the
     * space-prefixed form needs all three characters, and is rejected when
     * fewer are left.</p>
     *
     * @return true if one of these delimiters has been found, else false
     * @throws java.io.IOException if an I/O error occurs
     */
    private boolean isPrettyDamnAnnoyingDelimiterNext() throws IOException {
        char[] c = new char[3];
        int len = peek(c);

        if (len < 2) {
            return false;
        }

        switch (Character.toUpperCase(c[0])) {
            case 'X':
            case 'Z':
            case 'N':
            case 'G':
                return CocoLanguage.isSeparator(c[0], c[1]);
            case ' ':
                // Only look at c[2] when a third character was actually read.
                return (len == 3) && CocoLanguage.isSeparator(c[1], c[2]);
            default:
                return false;
        }
    }

    /**
     * Tests, without consuming anything, whether the input has run out.
     *
     * @return true if no further character can be read, else false
     * @throws java.io.IOException if an I/O error occurs
     */
    private boolean isEod() throws IOException {
        return peek() == -1;
    }

    /**
     * Update row and column counters by the specified displacement.
     * The counters are coordinates into a position in a 2-axis view of the lexer's
     * input (i.e., view as a Cobol source file).  This "file" has a width
     * determined by {@link com.sun.encoder.coco.model.CocoParser#SOURCE_LINE_LENGTH}.
     *
     * <p>When amount is a positive value, the counters are updated as if the
     * position they currently represent is displaced "forward" in the file by the
     * indicated amount.  When amount is negative, the counters update to display
     * the position "backward".</p>
     *
     * <p>So, for example, if the file view has a width of 72, and
     * column = 71, and row = 10, and amount = 3, then in moving forward 3 units
     * causes the changes: column = 2, row = 11.  Columns are 1-based, so a
     * position that lands exactly on the last column of a line stays on that
     * line: column = 1 moved forward by 143 gives column = 72 on the next
     * row.</p>
     *
     * <p>When the 72-column limit has been disabled, only the column counter
     * changes; no wrapping onto other rows takes place.</p>
     *
     * @param amount Positive or negative value indicating size of forward or
     *               backward displacement
     */
    private void movePosition(int amount) {
        mRecordCol += amount;
        if (mIsDisable72ColumnLimit) {
            return;
        }

        final int width = CocoParser.SOURCE_LINE_LENGTH;
        if (mRecordCol > width || mRecordCol < 1) {
            // Work with a 0-based offset so that exact multiples of the width
            // map to the last column of a line rather than to column 0.
            final int offset = mRecordCol - 1;
            int rows = offset / width;
            int col = offset % width;
            if (col < 0) {
                // Java's / and % truncate toward zero; adjust to floor
                // semantics for backward displacements.
                col += width;
                rows--;
            }
            mRecordRow += rows;
            mRecordCol = col + 1;
        }
    }

    /**
     * Looks at the next character of the lexer input without consuming it: the
     * character is read and immediately pushed back.
     *
     * @return Next input character, or -1 if EOF/EOD occurred
     *
     * @throws IOException if an I/O error occurs
     */
    private int peek() throws IOException {
        final int next = mInputReader.read();
        if (next == -1) {
            return -1;
        }
        mInputReader.unread(next);
        return next;
    }

    /**
     * Fill an array with characters from the lexer input source without
     * removing the characters from the input stream: the characters are read
     * and then pushed back in one piece.
     *
     * <p>Reading stops as soon as the array is full or EOF/EOD is reached,
     * whichever comes first. Elements of c beyond the returned count are left
     * untouched.</p>
     *
     * @param c array to fill
     *
     * @return Number of elements in c actually filled with input data; 0 if c
     *         is a zero-size array or if EOF/EOD occurred before any character
     *         could be obtained
     *
     * @throws IOException          if an I/O error occurs
     * @throws NullPointerException if c is null
     */
    private int peek(char[] c)
            throws IOException, NullPointerException {
        int count = 0;
        while (count < c.length) {
            final int value = mInputReader.read();
            if (value == -1) {
                break;
            }
            c[count++] = (char) value;
        }
        if (count > 0) {
            // Put everything back so the next read sees the same characters.
            mInputReader.unread(c, 0, count);
        }
        return count;
    }
    
    /**
     * Checks whether at least the given number of characters can still be
     * obtained from the input before EOD. The characters are peeked, not
     * consumed. Stream readiness (as reported by
     * {@link InputStreamReader#ready()}) is not consulted, so this call may
     * itself block while waiting for input.
     *
     * @param mincount Desired number of characters; negative values are
     *                 treated as 0, for which the answer is always
     *                 <code>true</code>
     *
     * @return <code>true</code> if max(0, <code>mincount</code>) characters
     *         could be peeked without reaching EOD, else <code>false</code>
     *
     * @throws IOException if an I/O error occurs
     */
    private boolean isCharsAvailable(int mincount)
            throws IOException {
        final int wanted = (mincount < 0) ? 0 : mincount;
        final char[] scratch = new char[wanted];
        return peek(scratch) == wanted;
    }
    
    /**
     * Consumes and returns the next character of the lexer input.
     *
     * @return the character read, or -1 if EOF/EOD occurred
     * @throws IOException if an I/O error occurs
     */
    private int read() throws IOException {
        final int consumed = mInputReader.read();
        return consumed;
    }

    /**
     * Pushes a single character back onto the lexer input so that it is the
     * next character to be read.
     *
     * @param chi the character to push back
     * @throws IOException if the reader rejects the push-back
     */
    private void unread(int chi) throws IOException {
        this.mInputReader.unread(chi);
    }
}