/*
 * CharacterSetElement.cs
 *
 * This work is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * This work is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 *
 * As a special exception, the copyright holders of this library give
 * you permission to link this library with independent modules to
 * produce an executable, regardless of the license terms of these
 * independent modules, and to copy and distribute the resulting
 * executable under terms of your choice, provided that you also meet,
 * for each linked independent module, the terms and conditions of the
 * license of that module. An independent module is a module which is
 * not derived from or based on this library. If you modify this
 * library, you may extend this exception to your version of the
 * library, but you are not obligated to do so. If you do not wish to
 * do so, delete this exception statement from your version.
 *
 * Copyright (c) 2003 Per Cederberg. All rights reserved.
 */

using System.Collections;
using System.IO;
using System.Text;

namespace PerCederberg.Grammatica.Parser.RE {

    /**
     * A regular expression character set element. This element
     * matches a single character inside (or outside) a character set.
     * The character set is user defined and may contain single
     * characters, ranges of characters and nested character sets.
     * The set may also be inverted, meaning that only characters not
     * inside the set will be considered to match.
     *
     * The predefined sets (DOT, DIGIT, ...) are recognized by
     * reference identity, which is why they are read-only and why
     * Clone() returns the same instance.
     *
     * @author   Per Cederberg, <per at percederberg dot net>
     * @version  1.0
     */
    internal class CharacterSetElement : Element {

        /**
         * The dot ('.') character set. This element matches a single
         * character that is not one of the line terminator characters
         * listed in InDotSet().
         */
        public static readonly CharacterSetElement DOT =
            new CharacterSetElement(false);

        /**
         * The digit character set. This element matches a single
         * character in the range '0' to '9'.
         */
        public static readonly CharacterSetElement DIGIT =
            new CharacterSetElement(false);

        /**
         * The non-digit character set. This element matches a single
         * character outside the range '0' to '9'.
         */
        public static readonly CharacterSetElement NON_DIGIT =
            new CharacterSetElement(true);

        /**
         * The whitespace character set. This element matches a single
         * whitespace character (see InWhitespaceSet()).
         */
        public static readonly CharacterSetElement WHITESPACE =
            new CharacterSetElement(false);

        /**
         * The non-whitespace character set. This element matches a
         * single non-whitespace character.
         */
        public static readonly CharacterSetElement NON_WHITESPACE =
            new CharacterSetElement(true);

        /**
         * The word character set. This element matches a single word
         * character (see InWordSet()).
         */
        public static readonly CharacterSetElement WORD =
            new CharacterSetElement(false);

        /**
         * The non-word character set. This element matches a single
         * non-word character.
         */
        public static readonly CharacterSetElement NON_WORD =
            new CharacterSetElement(true);

        /**
         * The inverted character set flag.
         */
        private readonly bool inverted;

        /**
         * The character set content. This list may contain boxed
         * char values, Range objects and nested CharacterSetElement
         * objects.
         */
        private readonly ArrayList contents = new ArrayList();

        /**
         * Creates a new character set element. If the inverted character
         * set flag is set, only characters NOT in the set will match.
         *
         * @param inverted       the inverted character set flag
         */
        public CharacterSetElement(bool inverted) {
            this.inverted = inverted;
        }

        /**
         * Adds a single character to this character set.
         *
         * @param c              the character to add
         */
        public void AddCharacter(char c) {
            contents.Add(c);
        }

        /**
         * Adds multiple characters to this character set. Each
         * character in the string is added individually.
         *
         * @param str            the string with characters to add
         */
        public void AddCharacters(string str) {
            for (int i = 0; i < str.Length; i++) {
                AddCharacter(str[i]);
            }
        }

        /**
         * Adds multiple characters to this character set. The
         * characters are taken from the string returned by the
         * element's GetString() method.
         *
         * @param elem           the string element with characters to add
         */
        public void AddCharacters(StringElement elem) {
            AddCharacters(elem.GetString());
        }

        /**
         * Adds a character range to this character set. Both limits
         * are inclusive.
         *
         * @param min            the minimum character value
         * @param max            the maximum character value
         */
        public void AddRange(char min, char max) {
            contents.Add(new Range(min, max));
        }

        /**
         * Adds a character subset to this character set. The subset
         * is consulted through its own InSet() check, so its inverted
         * flag is respected.
         *
         * @param elem           the character set to add
         */
        public void AddCharacterSet(CharacterSetElement elem) {
            contents.Add(elem);
        }

        /**
         * Returns this element as the character set shouldn't be
         * modified after creation. This partially breaks the contract
         * of clone(), but as new characters are not added to the
         * character set after creation, this will work correctly.
         * Returning the same instance also keeps the reference
         * comparisons against the predefined sets valid.
         *
         * @return this character set element
         */
        public override object Clone() {
            return this;
        }

        /**
         * Returns the length of a matching string starting at the
         * specified position. The number of matches to skip can also be
         * specified, but numbers higher than zero (0) cause a failed
         * match for any element that doesn't attempt to combine other
         * elements.
         *
         * @param m              the matcher being used
         * @param str            the string to match
         * @param start          the starting position
         * @param skip           the number of matches to skip
         *
         * @return the length of the matching string, or
         *         -1 if no match was found
         */
        public override int Match(Matcher m,
                                  string str,
                                  int start,
                                  int skip) {

            // A character set has exactly one way to match, so any
            // request to skip matches fails
            if (skip != 0) {
                return -1;
            }

            // No character left to test; tell the matcher that the
            // end of the string was reached
            if (start >= str.Length) {
                m.SetReadEndOfString();
                return -1;
            }
            return InSet(str[start]) ? 1 : -1;
        }

        /**
         * Checks if the specified character matches this character
         * set. This method takes the inverted flag into account. The
         * predefined sets are detected by reference comparison; any
         * other instance is treated as a user-defined set.
         *
         * @param c               the character to check
         *
         * @return true if the character matches, or
         *         false otherwise
         */
        private bool InSet(char c) {
            if (this == DOT) {
                return InDotSet(c);
            } else if (this == DIGIT || this == NON_DIGIT) {
                return InDigitSet(c) != inverted;
            } else if (this == WHITESPACE || this == NON_WHITESPACE) {
                return InWhitespaceSet(c) != inverted;
            } else if (this == WORD || this == NON_WORD) {
                return InWordSet(c) != inverted;
            } else {
                return InUserSet(c) != inverted;
            }
        }

        /**
         * Checks if the specified character is present in the 'dot'
         * set, i.e. is not one of the characters '\n', '\r',
         * '\u0085', '\u2028' or '\u2029'. This method does not
         * consider the inverted flag.
         *
         * @param c               the character to check
         *
         * @return true if the character is present, or
         *         false otherwise
         */
        private static bool InDotSet(char c) {
            switch (c) {
            case '\n':
            case '\r':
            case '\u0085':
            case '\u2028':
            case '\u2029':
                return false;
            default:
                return true;
            }
        }

        /**
         * Checks if the specified character is a digit in the range
         * '0' to '9'. This method does not consider the inverted
         * flag.
         *
         * @param c               the character to check
         *
         * @return true if the character is a digit, or
         *         false otherwise
         */
        private static bool InDigitSet(char c) {
            return '0' <= c && c <= '9';
        }

        /**
         * Checks if the specified character is a whitespace
         * character, i.e. a space, tab, newline, form feed, carriage
         * return or the character with code 11. This method does not
         * consider the inverted flag.
         *
         * @param c               the character to check
         *
         * @return true if the character is a whitespace character, or
         *         false otherwise
         */
        private static bool InWhitespaceSet(char c) {
            switch (c) {
            case ' ':
            case '\t':
            case '\n':
            case '\f':
            case '\r':
            case (char) 11:
                return true;
            default:
                return false;
            }
        }

        /**
         * Checks if the specified character is a word character,
         * i.e. an ASCII letter, an ASCII digit or an underscore. This
         * method does not consider the inverted flag.
         *
         * @param c               the character to check
         *
         * @return true if the character is a word character, or
         *         false otherwise
         */
        private static bool InWordSet(char c) {
            return ('a' <= c && c <= 'z')
                || ('A' <= c && c <= 'Z')
                || ('0' <= c && c <= '9')
                || c == '_';
        }

        /**
         * Checks if the specified character is present in the user-
         * defined set. This method does not consider the inverted
         * flag of this set, but nested character sets are checked
         * with their own inverted flag applied.
         *
         * @param value           the character to check
         *
         * @return true if the character is present, or
         *         false otherwise
         */
        private bool InUserSet(char value) {
            foreach (object obj in contents) {
                if (obj is char) {
                    if ((char) obj == value) {
                        return true;
                    }
                } else if (obj is Range) {
                    if (((Range) obj).Inside(value)) {
                        return true;
                    }
                } else if (obj is CharacterSetElement) {
                    if (((CharacterSetElement) obj).InSet(value)) {
                        return true;
                    }
                }
            }
            return false;
        }

        /**
         * Prints this element to the specified output stream. The
         * output is a single line with the indentation followed by
         * the string description of this set.
         *
         * @param output         the output stream to use
         * @param indent         the current indentation
         */
        public override void PrintTo(TextWriter output, string indent) {
            output.WriteLine(indent + ToString());
        }

        /**
         * Returns a string description of this character set. The
         * predefined sets are shown with their regular expression
         * shorthand, and user-defined sets as "[...]" or "[^...]"
         * (inverted) with the contents appended in insertion order.
         *
         * @return a string description of this character set
         */
        public override string ToString() {
            StringBuilder  buffer;

            // Handle predefined character sets
            if (this == DOT) {
                return ".";
            } else if (this == DIGIT) {
                return "\\d";
            } else if (this == NON_DIGIT) {
                return "\\D";
            } else if (this == WHITESPACE) {
                return "\\s";
            } else if (this == NON_WHITESPACE) {
                return "\\S";
            } else if (this == WORD) {
                return "\\w";
            } else if (this == NON_WORD) {
                return "\\W";
            }

            // Handle user-defined character sets. The negation
            // marker goes inside the bracket, as in regular
            // expression syntax.
            buffer = new StringBuilder();
            buffer.Append(inverted ? "[^" : "[");
            for (int i = 0; i < contents.Count; i++) {
                buffer.Append(contents[i]);
            }
            buffer.Append("]");

            return buffer.ToString();
        }


        /**
         * A character range class. Represents an inclusive interval
         * of character values.
         */
        private class Range {

            /**
             * The minimum character value.
             */
            private readonly char min;

            /**
             * The maximum character value.
             */
            private readonly char max;

            /**
             * Creates a new character range.
             *
             * @param min        the minimum character value
             * @param max        the maximum character value
             */
            public Range(char min, char max) {
                this.min = min;
                this.max = max;
            }

            /**
             * Checks if the specified character is inside the range.
             * Both limits are inclusive.
             *
             * @param c          the character to check
             *
             * @return true if the character is in the range, or
             *         false otherwise
             */
            public bool Inside(char c) {
                return min <= c && c <= max;
            }

            /**
             * Returns a string representation of this object, on the
             * form "min-max".
             *
             * @return a string representation of this object
             */
            public override string ToString() {
                return min + "-" + max;
            }
        }
    }
}