/*
 * RegExp.cs
 *
 * This work is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * This work is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 *
 * As a special exception, the copyright holders of this library give
 * you permission to link this library with independent modules to
 * produce an executable, regardless of the license terms of these
 * independent modules, and to copy and distribute the resulting
 * executable under terms of your choice, provided that you also meet,
 * for each linked independent module, the terms and conditions of the
 * license of that module. An independent module is a module which is
 * not derived from or based on this library. If you modify this
 * library, you may extend this exception to your version of the
 * library, but you are not obligated to do so. If you do not wish to
 * do so, delete this exception statement from your version.
 *
 * Copyright (c) 2003 Per Cederberg. All rights reserved.
 */

using System;
using System.Collections;
using System.IO;
using System.Globalization;
using System.Text;

namespace PerCederberg.Grammatica.Parser.RE {

    /**
     * A regular expression. This class creates and holds an internal
     * data structure representing a regular expression. It also
     * allows creating matchers. This class is thread-safe. Multiple
     * matchers may operate simultaneously on the same regular
     * expression.
     *
     * @author   Per Cederberg, <per at percederberg dot net>
     * @version  1.0
     */
    public class RegExp {

        /**
         * The base regular expression element. Assigned once by the
         * constructor; each matcher receives its own clone.
         */
        private readonly Element element;

        /**
         * The regular expression pattern.
         */
        private readonly string pattern;

        /**
         * The current position in the pattern. This variable is used by
         * the parsing methods.
         */
        private int pos;

        /**
         * Creates a new regular expression.
         *
         * @param pattern        the regular expression pattern
         *
         * @throws ArgumentNullException if the pattern is null
         * @throws RegExpException if the regular expression couldn't be
         *             parsed correctly
         */
        public RegExp(string pattern) {
            // Fail fast with a descriptive exception instead of a
            // NullReferenceException from inside the parsing helpers.
            if (pattern == null) {
                throw new ArgumentNullException("pattern");
            }
            this.pattern = pattern;
            this.pos = 0;
            this.element = ParseExpr();
            // Anything left over after the top-level expression (for
            // instance an unbalanced ')') is a syntax error.
            if (pos < pattern.Length) {
                throw new RegExpException(
                    RegExpException.ErrorType.UNEXPECTED_CHARACTER,
                    pos,
                    pattern);
            }
        }

        /**
         * Creates a new matcher for the specified string. The matcher
         * is given a clone of the parsed element tree.
         *
         * @param str            the string to work with
         *
         * @return the regular expression matcher
         */
        public Matcher Matcher(string str) {
            return new Matcher((Element) element.Clone(), str);
        }

        /**
         * Returns a string representation of the regular expression.
         *
         * @return a string representation of the regular expression
         */
        public override string ToString() {
            StringWriter  str;

            str = new StringWriter();
            str.WriteLine("Regular Expression");
            str.WriteLine(" Pattern: " + pattern);
            str.WriteLine(" Compiled:");
            element.PrintTo(str, " ");
            return str.ToString();
        }

        /**
         * Parses a regular expression. This method handles the Expr
         * production in the grammar (see regexp.grammar).
         *
         * @return the element representing this expression
         *
         * @throws RegExpException if an error was encountered in the
         *             pattern string
         */
        private Element ParseExpr() {
            Element  first;
            Element  second;

            first = ParseTerm();
            if (PeekChar(0) != '|') {
                return first;
            } else {
                ReadChar('|');
                // Right-recursive: "a|b|c" becomes a|(b|c).
                second = ParseExpr();
                return new AlternativeElement(first, second);
            }
        }

        /**
         * Parses a regular expression term. This method handles the
         * Term production in the grammar (see regexp.grammar).
         *
         * @return the element representing this term
         *
         * @throws RegExpException if an error was encountered in the
         *             pattern string
         */
        private Element ParseTerm() {
            ArrayList  list = new ArrayList();

            list.Add(ParseFact());
            while (true) {
                switch (PeekChar(0)) {
                case -1:
                case ')':
                case ']':
                case '{':
                case '}':
                case '?':
                case '+':
                case '|':
                    // End of pattern or a character that cannot start
                    // a factor; the term is complete.
                    return CombineElements(list);
                default:
                    list.Add(ParseFact());
                    break;
                }
            }
        }

        /**
         * Parses a regular expression factor. This method handles the
         * Fact production in the grammar (see regexp.grammar).
         *
         * @return the element representing this factor
         *
         * @throws RegExpException if an error was encountered in the
         *             pattern string
         */
        private Element ParseFact() {
            Element  elem;

            elem = ParseAtom();
            switch (PeekChar(0)) {
            case '?':
            case '*':
            case '+':
            case '{':
                return ParseAtomModifier(elem);
            default:
                return elem;
            }
        }

        /**
         * Parses a regular expression atom. This method handles the
         * Atom production in the grammar (see regexp.grammar).
         *
         * @return the element representing this atom
         *
         * @throws RegExpException if an error was encountered in the
         *             pattern string
         */
        private Element ParseAtom() {
            Element  elem;

            switch (PeekChar(0)) {
            case '.':
                ReadChar('.');
                return CharacterSetElement.DOT;
            case '(':
                ReadChar('(');
                elem = ParseExpr();
                ReadChar(')');
                return elem;
            case '[':
                ReadChar('[');
                elem = ParseCharSet();
                ReadChar(']');
                return elem;
            case -1:
            case ')':
            case ']':
            case '{':
            case '}':
            case '?':
            case '*':
            case '+':
            case '|':
                // None of these can start an atom.
                throw new RegExpException(
                    RegExpException.ErrorType.UNEXPECTED_CHARACTER,
                    pos,
                    pattern);
            default:
                return ParseChar();
            }
        }

        /**
         * Parses a regular expression atom modifier. This method handles
         * the AtomModifier production in the grammar (see regexp.grammar).
         *
         * @param elem           the element to modify
         *
         * @return the modified element
         *
         * @throws RegExpException if an error was encountered in the
         *             pattern string
         */
        private Element ParseAtomModifier(Element elem) {
            int                       min = 0;
            int                       max = -1;
            RepeatElement.RepeatType  type;
            int                       firstPos;

            // Read min and max (a max of -1 means "no upper bound")
            type = RepeatElement.RepeatType.GREEDY;
            switch (ReadChar()) {
            case '?':
                min = 0;
                max = 1;
                break;
            case '*':
                min = 0;
                max = -1;
                break;
            case '+':
                min = 1;
                max = -1;
                break;
            case '{':
                // Position of the '{', used for error reporting
                firstPos = pos - 1;
                min = ReadNumber();
                max = min;
                if (PeekChar(0) == ',') {
                    ReadChar(',');
                    max = -1;
                    if (PeekChar(0) != '}') {
                        max = ReadNumber();
                    }
                }
                ReadChar('}');
                if (max == 0 || (max > 0 && min > max)) {
                    throw new RegExpException(
                        RegExpException.ErrorType.INVALID_REPEAT_COUNT,
                        firstPos,
                        pattern);
                }
                break;
            default:
                throw new RegExpException(
                    RegExpException.ErrorType.UNEXPECTED_CHARACTER,
                    pos - 1,
                    pattern);
            }

            // Read operator mode
            if (PeekChar(0) == '?') {
                ReadChar('?');
                type = RepeatElement.RepeatType.RELUCTANT;
            } else if (PeekChar(0) == '+') {
                ReadChar('+');
                type = RepeatElement.RepeatType.POSSESSIVE;
            }

            return new RepeatElement(elem, min, max, type);
        }

        /**
         * Parses a regular expression character set. This method handles
         * the contents of the '[...]' construct in a regular expression.
         * The surrounding brackets are consumed by the caller.
         *
         * @return the element representing this character set
         *
         * @throws RegExpException if an error was encountered in the
         *             pattern string
         */
        private Element ParseCharSet() {
            CharacterSetElement  charset;
            Element              elem;
            bool                 repeat = true;
            char                 start;
            char                 end;

            if (PeekChar(0) == '^') {
                ReadChar('^');
                charset = new CharacterSetElement(true);
            } else {
                charset = new CharacterSetElement(false);
            }

            // PeekChar signals end-of-pattern with -1, so the tests
            // below use ">= 0"; a literal U+0000 character is a valid
            // set member and must not be mistaken for end of input.
            while (PeekChar(0) >= 0 && repeat) {
                start = (char) PeekChar(0);
                switch (start) {
                case ']':
                    repeat = false;
                    break;
                case '\\':
                    elem = ParseEscapeChar();
                    if (elem is StringElement) {
                        charset.AddCharacters((StringElement) elem);
                    } else {
                        charset.AddCharacterSet((CharacterSetElement) elem);
                    }
                    break;
                default:
                    ReadChar(start);
                    // A '-' only forms a range when something other
                    // than the closing ']' follows it.
                    if (PeekChar(0) == '-'
                     && PeekChar(1) >= 0
                     && PeekChar(1) != ']') {

                        ReadChar('-');
                        end = ReadChar();
                        charset.AddRange(start, end);
                    } else {
                        charset.AddCharacter(start);
                    }
                    break;
                }
            }
            return charset;
        }

        /**
         * Parses a regular expression character. This method handles
         * a single normal character in a regular expression.
         *
         * @return the element representing this character
         *
         * @throws RegExpException if an error was encountered in the
         *             pattern string
         */
        private Element ParseChar() {
            switch (PeekChar(0)) {
            case '\\':
                return ParseEscapeChar();
            case '^':
            case '$':
                throw new RegExpException(
                    RegExpException.ErrorType.UNSUPPORTED_SPECIAL_CHARACTER,
                    pos,
                    pattern);
            default:
                return new StringElement(ReadChar());
            }
        }

        /**
         * Parses a regular expression character escape. This method
         * handles a single character escape in a regular expression.
         *
         * @return the element representing this character escape
         *
         * @throws RegExpException if an error was encountered in the
         *             pattern string
         */
        private Element ParseEscapeChar() {
            char    c;
            string  str;
            int     value;

            ReadChar('\\');
            c = ReadChar();
            switch (c) {
            case '0':
                // Octal escape: \0 followed by one to three octal
                // digits, the first of which must be in 0..3.
                c = ReadChar();
                if (c < '0' || c > '3') {
                    throw new RegExpException(
                        RegExpException.ErrorType.UNSUPPORTED_ESCAPE_CHARACTER,
                        pos - 3,
                        pattern);
                }
                value = c - '0';
                c = (char) PeekChar(0);
                if ('0' <= c && c <= '7') {
                    value *= 8;
                    value += ReadChar() - '0';
                    c = (char) PeekChar(0);
                    if ('0' <= c && c <= '7') {
                        value *= 8;
                        value += ReadChar() - '0';
                    }
                }
                return new StringElement((char) value);
            case 'x':
                // Hexadecimal escape: \x followed by two hex digits
                str = ReadChar().ToString() +
                      ReadChar().ToString();
                try {
                    value = Int32.Parse(str,
                                        NumberStyles.AllowHexSpecifier);
                    return new StringElement((char) value);
                } catch (FormatException) {
                    throw new RegExpException(
                        RegExpException.ErrorType.UNSUPPORTED_ESCAPE_CHARACTER,
                        pos - str.Length - 2,
                        pattern);
                }
            case 'u':
                // Unicode escape: \u followed by four hex digits
                str = ReadChar().ToString() +
                      ReadChar().ToString() +
                      ReadChar().ToString() +
                      ReadChar().ToString();
                try {
                    value = Int32.Parse(str,
                                        NumberStyles.AllowHexSpecifier);
                    return new StringElement((char) value);
                } catch (FormatException) {
                    throw new RegExpException(
                        RegExpException.ErrorType.UNSUPPORTED_ESCAPE_CHARACTER,
                        pos - str.Length - 2,
                        pattern);
                }
            case 't':
                return new StringElement('\t');
            case 'n':
                return new StringElement('\n');
            case 'r':
                return new StringElement('\r');
            case 'f':
                return new StringElement('\f');
            case 'a':
                return new StringElement('\u0007');
            case 'e':
                return new StringElement('\u001B');
            case 'd':
                return CharacterSetElement.DIGIT;
            case 'D':
                return CharacterSetElement.NON_DIGIT;
            case 's':
                return CharacterSetElement.WHITESPACE;
            case 'S':
                return CharacterSetElement.NON_WHITESPACE;
            case 'w':
                return CharacterSetElement.WORD;
            case 'W':
                return CharacterSetElement.NON_WORD;
            default:
                // Any other ASCII letter is rejected as an unsupported
                // escape; every remaining character stands for itself.
                if (('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z')) {
                    throw new RegExpException(
                        RegExpException.ErrorType.UNSUPPORTED_ESCAPE_CHARACTER,
                        pos - 2,
                        pattern);
                }
                return new StringElement(c);
            }
        }

        /**
         * Reads a number from the pattern. If the next character isn't a
         * numeric character, an exception is thrown. This method reads
         * several consecutive numeric characters. A digit sequence too
         * large for a 32-bit integer is reported as an invalid repeat
         * count, positioned at the first digit.
         *
         * @return the numeric value read
         *
         * @throws RegExpException if an error was encountered in the
         *             pattern string
         */
        private int ReadNumber() {
            StringBuilder  buf = new StringBuilder();
            int            start = pos;
            int            c;

            c = PeekChar(0);
            while ('0' <= c && c <= '9') {
                buf.Append(ReadChar());
                c = PeekChar(0);
            }
            if (buf.Length <= 0) {
                throw new RegExpException(
                    RegExpException.ErrorType.UNEXPECTED_CHARACTER,
                    pos,
                    pattern);
            }
            try {
                // Only ASCII digits were collected, so the sole failure
                // mode is overflow. Report it as a pattern error rather
                // than leaking an OverflowException to the caller.
                return Int32.Parse(buf.ToString(),
                                   CultureInfo.InvariantCulture);
            } catch (OverflowException) {
                throw new RegExpException(
                    RegExpException.ErrorType.INVALID_REPEAT_COUNT,
                    start,
                    pattern);
            }
        }

        /**
         * Reads the next character in the pattern. If no next character
         * exists, an exception is thrown.
         *
         * @return the character read
         *
         * @throws RegExpException if no next character was available in
         *             the pattern string
         */
        private char ReadChar() {
            int  c = PeekChar(0);

            if (c < 0) {
                throw new RegExpException(
                    RegExpException.ErrorType.UNTERMINATED_PATTERN,
                    pos,
                    pattern);
            } else {
                pos++;
                return (char) c;
            }
        }

        /**
         * Reads the next character in the pattern. If the character
         * wasn't the specified one, an exception is thrown.
         *
         * @param c              the character to read
         *
         * @return the character read
         *
         * @throws RegExpException if the character read didn't match the
         *             specified one, or if no next character was
         *             available in the pattern string
         */
        private char ReadChar(char c) {
            if (c != ReadChar()) {
                // The offending character has already been consumed,
                // hence the "pos - 1".
                throw new RegExpException(
                    RegExpException.ErrorType.UNEXPECTED_CHARACTER,
                    pos - 1,
                    pattern);
            }
            return c;
        }

        /**
         * Returns a character that has not yet been read from the
         * pattern. If the requested position is beyond the end of the
         * pattern string, -1 is returned.
         *
         * @param count          the preview position, from zero (0)
         *
         * @return the character found, or
         *         -1 if beyond the end of the pattern string
         */
        private int PeekChar(int count) {
            if (pos + count < pattern.Length) {
                return pattern[pos + count];
            } else {
                return -1;
            }
        }

        /**
         * Combines a list of elements. This method takes care to always
         * concatenate adjacent string elements into a single string
         * element. The list must contain at least one element, and it
         * is modified in place.
         *
         * @param list           the list with elements
         *
         * @return the combined element
         */
        private Element CombineElements(ArrayList list) {
            Element  prev;
            Element  elem;
            string   str;
            int      i;

            // Concatenate string elements
            prev = (Element) list[0];
            for (i = 1; i < list.Count; i++) {
                elem = (Element) list[i];
                if (prev is StringElement && elem is StringElement) {
                    str = ((StringElement) prev).GetString() +
                          ((StringElement) elem).GetString();
                    elem = new StringElement(str);
                    // Replace the pair with the merged element and
                    // revisit the same index on the next iteration.
                    list.RemoveAt(i);
                    list[i - 1] = elem;
                    i--;
                }
                prev = elem;
            }

            // Combine all remaining elements (right-nested chain)
            elem = (Element) list[list.Count - 1];
            for (i = list.Count - 2; i >= 0; i--) {
                prev = (Element) list[i];
                elem = new CombineElement(prev, elem);
            }

            return elem;
        }
    }
}