/*
 * RegExp.cs
 *
 * This work is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * This work is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software 
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 *
 * As a special exception, the copyright holders of this library give
 * you permission to link this library with independent modules to
 * produce an executable, regardless of the license terms of these
 * independent modules, and to copy and distribute the resulting
 * executable under terms of your choice, provided that you also meet,
 * for each linked independent module, the terms and conditions of the
 * license of that module. An independent module is a module which is
 * not derived from or based on this library. If you modify this
 * library, you may extend this exception to your version of the
 * library, but you are not obligated to do so. If you do not wish to
 * do so, delete this exception statement from your version.
 *
 * Copyright (c) 2003 Per Cederberg. All rights reserved.
 */

using System;
using System.Collections;
using System.IO;
using System.Globalization;
using System.Text;

namespace PerCederberg.Grammatica.Parser.RE {

    /**
     * A regular expression. This class creates and holds an internal
     * data structure representing a regular expression. It also
     * allows creating matchers. This class is thread-safe. Multiple
     * matchers may operate simultaneously on the same regular
     * expression.
     *
     * @author   Per Cederberg, <per at percederberg dot net>
     * @version  1.0
     */
    public class RegExp {

        /**
         * The base regular expression element. It is assigned in the
         * constructor from the result of parsing the pattern, and a
         * clone of it is handed to every matcher that is created.
         */
        private Element element;

        /**
         * The regular expression pattern, stored as it was passed to
         * the constructor.
         */
        private string pattern;

        /**
         * The current position in the pattern, i.e. the index of the
         * next character to be read. This variable is used by the
         * parsing methods, which are started from the constructor.
         */
        private int pos;

        /**
         * Creates a new regular expression. The pattern is parsed
         * immediately, and the whole pattern must be consumed by the
         * parser for the construction to succeed.
         * 
         * @param pattern        the regular expression pattern
         * 
         * @throws ArgumentNullException if the pattern is null
         * @throws RegExpException if the regular expression couldn't be
         *             parsed correctly
         */
        public RegExp(string pattern) {
            // Fail fast with a descriptive exception instead of letting
            // a NullReferenceException escape from the parsing methods.
            if (pattern == null) {
                throw new ArgumentNullException("pattern");
            }
            this.pattern = pattern;
            this.pos = 0;
            this.element = ParseExpr();

            // Anything left over after the top-level expression (e.g. an
            // unbalanced ')') is a syntax error at the current position.
            if (pos < pattern.Length) {
                throw new RegExpException(
                    RegExpException.ErrorType.UNEXPECTED_CHARACTER,
                    pos,
                    pattern);
            }
        }

        /**
         * Creates a new matcher for the specified string. The matcher
         * is given its own clone of the parsed element tree rather than
         * the instance held by this object.
         * 
         * @param str            the string to work with
         * 
         * @return the regular expression matcher
         */
        public Matcher Matcher(string str) {
            Element  copy = (Element) element.Clone();

            return new Matcher(copy, str);
        }

        /**
         * Returns a string representation of the regular expression.
         * The result is a multi-line dump with a heading, the pattern
         * text, and the printout of the parsed element tree.
         * 
         * @return a string representation of the regular expression
         */
        public override string ToString() {
            StringWriter  output = new StringWriter();

            output.WriteLine("Regular Expression");
            output.Write("  Pattern: ");
            output.WriteLine(pattern);
            output.WriteLine("  Compiled:");
            // The element tree prints itself with a four-space indent
            element.PrintTo(output, "    ");
            return output.ToString();
        }

        /**
         * Parses a regular expression. This method handles the Expr
         * production in the grammar (see regexp.grammar): a term,
         * optionally followed by '|' and another expression.
         * 
         * @return the element representing this expression
         * 
         * @throws RegExpException if an error was encountered in the 
         *             pattern string
         */
        private Element ParseExpr() {
            Element  left = ParseTerm();

            if (PeekChar(0) == '|') {
                ReadChar('|');
                // The right-hand side is parsed recursively, so chained
                // alternatives nest towards the right.
                return new AlternativeElement(left, ParseExpr());
            }
            return left;
        }
    
        /**
         * Parses a regular expression term. This method handles the 
         * Term production in the grammar (see regexp.grammar): one or
         * more factors in sequence, combined into a single element.
         * 
         * @return the element representing this term
         * 
         * @throws RegExpException if an error was encountered in the 
         *             pattern string
         */
        private Element ParseTerm() {
            ArrayList  factors = new ArrayList();
            int        next;

            // At least one factor is always required. Keep reading
            // factors until the end of the pattern or a character that
            // terminates the term. Note that '*' is not a terminator
            // here; it is passed on to ParseFact(), where ParseAtom()
            // rejects it as an unexpected character.
            do {
                factors.Add(ParseFact());
                next = PeekChar(0);
            } while (next != -1
                  && next != ')'
                  && next != ']'
                  && next != '{'
                  && next != '}'
                  && next != '?'
                  && next != '+'
                  && next != '|');

            return CombineElements(factors);
        }

        /**
         * Parses a regular expression factor. This method handles the 
         * Fact production in the grammar (see regexp.grammar): an atom,
         * optionally followed by a repeat modifier.
         * 
         * @return the element representing this factor
         * 
         * @throws RegExpException if an error was encountered in the 
         *             pattern string
         */
        private Element ParseFact() {
            Element  atom = ParseAtom();
            int      next = PeekChar(0);

            // A modifier is introduced by '?', '*', '+' or '{'
            if (next == '?' || next == '*' || next == '+' || next == '{') {
                return ParseAtomModifier(atom);
            }
            return atom;
        }

        /**
         * Parses a regular expression atom. This method handles the 
         * Atom production in the grammar (see regexp.grammar): the dot
         * character set, a parenthesized expression, a bracketed
         * character set, or a single (possibly escaped) character.
         * 
         * @return the element representing this atom
         * 
         * @throws RegExpException if an error was encountered in the 
         *             pattern string
         */
        private Element ParseAtom() {
            int      next = PeekChar(0);
            Element  inner;

            if (next == '.') {
                ReadChar('.');
                return CharacterSetElement.DOT;
            }
            if (next == '(') {
                ReadChar('(');
                inner = ParseExpr();
                ReadChar(')');
                return inner;
            }
            if (next == '[') {
                ReadChar('[');
                inner = ParseCharSet();
                ReadChar(']');
                return inner;
            }

            // End of pattern, closing brackets, modifiers and '|' can
            // never start an atom.
            if (next == -1
             || next == ')'
             || next == ']'
             || next == '{'
             || next == '}'
             || next == '?'
             || next == '*'
             || next == '+'
             || next == '|') {

                throw new RegExpException(
                    RegExpException.ErrorType.UNEXPECTED_CHARACTER,
                    pos,
                    pattern);
            }
            return ParseChar();
        }

        /**
         * Parses a regular expression atom modifier. This method handles 
         * the AtomModifier production in the grammar (see regexp.grammar).
         * A modifier is one of '?', '*', '+' or a '{min}', '{min,}' or
         * '{min,max}' count, optionally followed by '?' (reluctant) or
         * '+' (possessive). Without such a suffix the repeat is greedy.
         *
         * @param elem           the element to modify
         *  
         * @return the modified element 
         * 
         * @throws RegExpException if an error was encountered in the 
         *             pattern string
         */
        private Element ParseAtomModifier(Element elem) {
            RepeatElement.RepeatType  mode = RepeatElement.RepeatType.GREEDY;
            int                       lower;
            int                       upper;
            char                      op = ReadChar();

            // Read the repeat bounds. An upper bound of -1 is used when
            // the modifier does not specify one.
            if (op == '?') {
                lower = 0;
                upper = 1;
            } else if (op == '*') {
                lower = 0;
                upper = -1;
            } else if (op == '+') {
                lower = 1;
                upper = -1;
            } else if (op == '{') {
                // Remember where the '{' was, for error reporting
                int  bracePos = pos -1;

                lower = ReadNumber();
                upper = lower;
                if (PeekChar(0) == ',') {
                    ReadChar(',');
                    upper = (PeekChar(0) == '}') ? -1 : ReadNumber();
                }
                ReadChar('}');

                // A zero upper bound, or a lower bound above an
                // explicit upper bound, is rejected.
                if (upper == 0 || (upper > 0 && lower > upper)) {
                    throw new RegExpException(
                        RegExpException.ErrorType.INVALID_REPEAT_COUNT,
                        bracePos,
                        pattern);
                }
            } else {
                throw new RegExpException(
                    RegExpException.ErrorType.UNEXPECTED_CHARACTER,
                    pos -1,
                    pattern);
            }

            // Read the optional operator mode suffix
            switch (PeekChar(0)) {
            case '?':
                ReadChar('?');
                mode = RepeatElement.RepeatType.RELUCTANT;
                break;
            case '+':
                ReadChar('+');
                mode = RepeatElement.RepeatType.POSSESSIVE;
                break;
            }

            return new RepeatElement(elem, lower, upper, mode);
        }

        /**
         * Parses a regular expression character set. This method handles 
         * the contents of the '[...]' construct in a regular expression,
         * i.e. everything between the brackets. The brackets themselves
         * are not read here. A leading '^' creates an inverted set.
         * 
         * @return the element representing this character set
         * 
         * @throws RegExpException if an error was encountered in the 
         *             pattern string
         */
        private Element ParseCharSet() {
            CharacterSetElement  set;
            bool                 inverted = (PeekChar(0) == '^');

            if (inverted) {
                ReadChar('^');
            }
            set = new CharacterSetElement(inverted);

            while (PeekChar(0) > 0) {
                char  first = (char) PeekChar(0);

                // The closing bracket is left unread
                if (first == ']') {
                    break;
                }

                // An escape yields either plain characters or a whole
                // predefined character set
                if (first == '\\') {
                    Element  escaped = ParseEscapeChar();

                    if (escaped is StringElement) {
                        set.AddCharacters((StringElement) escaped);
                    } else {
                        set.AddCharacterSet((CharacterSetElement) escaped);
                    }
                    continue;
                }

                // A '-' only forms a range when it is followed by some
                // character other than the closing bracket; otherwise
                // it is handled as an ordinary character on the next
                // iteration.
                ReadChar(first);
                bool  isRange = PeekChar(0) == '-'
                             && PeekChar(1) > 0 
                             && PeekChar(1) != ']';
                if (isRange) {
                    ReadChar('-');
                    char  last = ReadChar();
                    set.AddRange(first, last);
                } else {
                    set.AddCharacter(first);
                }
            }

            return set;
        }

        /**
         * Parses a regular expression character. This method handles 
         * a single normal character in a regular expression. A backslash
         * is delegated to the escape parser, and the '^' and '$'
         * characters are rejected as unsupported.
         * 
         * @return the element representing this character
         * 
         * @throws RegExpException if an error was encountered in the 
         *             pattern string
         */
        private Element ParseChar() {
            int  next = PeekChar(0);

            if (next == '\\') {
                return ParseEscapeChar();
            }
            if (next == '^' || next == '$') {
                throw new RegExpException(
                    RegExpException.ErrorType.UNSUPPORTED_SPECIAL_CHARACTER,
                    pos,
                    pattern);
            }
            return new StringElement(ReadChar());
        }

        /**
         * Parses a regular expression character escape. This method 
         * handles a single character escape in a regular expression,
         * starting at the backslash.
         *
         * The supported escapes are: octal (a backslash and '0' followed
         * by one to three octal digits, the first being in the range
         * 0-3), hexadecimal ('x' with two hex digits or 'u' with four),
         * the control characters t, n, r, f, a and e, and the character
         * classes d, D, s, S, w and W. Any other ASCII letter is
         * rejected, and any other character is taken literally.
         * 
         * @return the element representing this character escape
         * 
         * @throws RegExpException if an error was encountered in the 
         *             pattern string
         */
        private Element ParseEscapeChar() {
            char  c;
            int   next;
            int   value;
        
            ReadChar('\\');
            c = ReadChar();
            switch (c) {
            case '0':
                // NOTE(review): the first octal digit is limited to 0-3,
                // so e.g. a backslash followed by "07" is rejected.
                // Confirm that this restriction is intended.
                c = ReadChar();
                if (c < '0' || c > '3') {
                    throw new RegExpException(
                        RegExpException.ErrorType.UNSUPPORTED_ESCAPE_CHARACTER,
                        pos -3,
                        pattern);
                }
                value = c -'0';

                // The lookahead is compared as an int: PeekChar()
                // returns -1 at the end of the pattern, and casting that
                // to char would overflow in a checked build.
                next = PeekChar(0);
                if ('0' <= next && next <= '7') {
                    value = value * 8 + (ReadChar() -'0');
                    next = PeekChar(0);
                    if ('0' <= next && next <= '7') {
                        value = value * 8 + (ReadChar() -'0');
                    }
                }
                return new StringElement((char) value);
            case 'x':
                return ParseHexEscape(2);
            case 'u':
                return ParseHexEscape(4);
            case 't':
                return new StringElement('\t');
            case 'n':
                return new StringElement('\n');
            case 'r':
                return new StringElement('\r');
            case 'f':
                return new StringElement('\f');
            case 'a':
                return new StringElement('\u0007');
            case 'e':
                return new StringElement('\u001B');
            case 'd':
                return CharacterSetElement.DIGIT;
            case 'D':
                return CharacterSetElement.NON_DIGIT;
            case 's':
                return CharacterSetElement.WHITESPACE;
            case 'S':
                return CharacterSetElement.NON_WHITESPACE;
            case 'w':
                return CharacterSetElement.WORD;
            case 'W':
                return CharacterSetElement.NON_WORD;
            default:
                // Unknown letters are rejected, anything else is a
                // literal character.
                if (('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z')) {
                    throw new RegExpException(
                        RegExpException.ErrorType.UNSUPPORTED_ESCAPE_CHARACTER,
                        pos -2,
                        pattern);             
                }
                return new StringElement(c);
            }
        }

        /**
         * Reads a fixed number of hexadecimal digits and returns the
         * character with that code. The backslash and the escape letter
         * must already have been read.
         *
         * @param digits         the number of hex digits to read
         *
         * @return the element representing the escaped character
         *
         * @throws RegExpException if the pattern ended too early, or if
         *             the characters read were not hexadecimal digits
         */
        private Element ParseHexEscape(int digits) {
            StringBuilder  buf = new StringBuilder(digits);
            int            value;
            int            i;

            for (i = 0; i < digits; i++) {
                buf.Append(ReadChar());
            }
            try {
                value = Int32.Parse(buf.ToString(),
                                    NumberStyles.AllowHexSpecifier,
                                    CultureInfo.InvariantCulture);
                return new StringElement((char) value);
            } catch (FormatException) {
                // Report the position of the backslash: the digits plus
                // the two characters that introduced the escape.
                throw new RegExpException(
                    RegExpException.ErrorType.UNSUPPORTED_ESCAPE_CHARACTER,
                    pos -digits -2,
                    pattern);
            }
        }

        /**
         * Reads a number from the pattern. If the next character isn't a
         * numeric character, an exception is thrown. This method reads
         * several consecutive numeric characters (ASCII '0' to '9').
         * 
         * @return the numeric value read
         * 
         * @throws RegExpException if an error was encountered in the 
         *             pattern string, or if the number read is too large
         *             to be represented as an int
         */
        private int ReadNumber() {
            StringBuilder  buf = new StringBuilder();
            int            startPos = pos;
            int            c;
            
            c = PeekChar(0);
            while ('0' <= c && c <= '9') {
                buf.Append(ReadChar());
                c = PeekChar(0);
            }
            if (buf.Length <= 0) {
                throw new RegExpException(
                    RegExpException.ErrorType.UNEXPECTED_CHARACTER,
                    pos,
                    pattern);
            }
            try {
                // The buffer only holds ASCII digits, so parse it
                // independently of the current culture.
                return Int32.Parse(buf.ToString(),
                                   CultureInfo.InvariantCulture);
            } catch (OverflowException) {
                // A digit sequence beyond the int range would otherwise
                // escape as an OverflowException. Report it as a pattern
                // error at the first digit. The only visible caller
                // reads repeat counts, hence the error type.
                throw new RegExpException(
                    RegExpException.ErrorType.INVALID_REPEAT_COUNT,
                    startPos,
                    pattern);
            }
        }

        /**
         * Reads the next character in the pattern and advances the
         * current position past it. If no next character exists, an
         * exception is thrown and the position is left unchanged.
         * 
         * @return the character read 
         * 
         * @throws RegExpException if no next character was available in  
         *             the pattern string
         */
        private char ReadChar() {
            int  next = PeekChar(0);

            // A negative value signals the end of the pattern
            if (next < 0) {
                throw new RegExpException(
                    RegExpException.ErrorType.UNTERMINATED_PATTERN, 
                    pos,
                    pattern);
            }
            pos++;
            return (char) next;
        }

        /**
         * Reads the next character in the pattern and checks that it is
         * the expected one. If the character wasn't the specified one,
         * an exception is thrown. The character is consumed in either
         * case.
         * 
         * @param c              the character to read
         * 
         * @return the character read 
         * 
         * @throws RegExpException if the character read didn't match the
         *             specified one, or if no next character was 
         *             available in the pattern string
         */
        private char ReadChar(char c) {
            char  actual = ReadChar();

            if (actual == c) {
                return c;
            }

            // The position has already moved past the offending
            // character, so step back one for the error report.
            throw new RegExpException(
                RegExpException.ErrorType.UNEXPECTED_CHARACTER, 
                pos -1,
                pattern);
        }

        /**
         * Returns a character that has not yet been read from the 
         * pattern, without changing the current position. If the
         * requested position is beyond the end of the pattern string,
         * -1 is returned.
         * 
         * @param count          the preview position, from zero (0)
         * 
         * @return the character found, or
         *         -1 if beyond the end of the pattern string
         */
        private int PeekChar(int count) {
            int  index = pos + count;

            return (index < pattern.Length) ? (int) pattern[index] : -1;
        }
    
        /**
         * Combines a list of elements into a single element. This method
         * takes care to always concatenate adjacent string elements into
         * a single string element. The merging is done in place, so the
         * list passed in is modified.
         * 
         * @param list           the list with elements
         * 
         * @return the combined element
         */
        private Element CombineElements(ArrayList list) {
            Element  combined;
            int      index;

            // Merge each pair of neighbouring string elements. After a
            // merge the index stays put, so that the merged element is
            // compared with its new right-hand neighbour.
            index = 1;
            while (index < list.Count) {
                Element  left = (Element) list[index -1];
                Element  right = (Element) list[index];

                if (left is StringElement && right is StringElement) {
                    string  text = ((StringElement) left).GetString() +
                                   ((StringElement) right).GetString();

                    list[index -1] = new StringElement(text);
                    list.RemoveAt(index);
                } else {
                    index++;
                }
            }

            // Chain what remains from the last element backwards, so
            // that each element wraps everything that follows it.
            index = list.Count -1;
            combined = (Element) list[index];
            while (index > 0) {
                index--;
                combined = new CombineElement((Element) list[index], combined);
            }

            return combined;
        }
    }
}