001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.lang3;
018
019 import java.io.Serializable;
020 import java.util.Collections;
021 import java.util.HashMap;
022 import java.util.HashSet;
023 import java.util.Map;
024 import java.util.Set;
025
026 /**
027 * <p>A set of characters.</p>
028 *
029 * <p>Instances are immutable, but instances of subclasses may not be.</p>
030 *
031 * <p>#ThreadSafe#</p>
032 * @author Apache Software Foundation
033 * @author Phil Steitz
034 * @author Pete Gieser
035 * @author Gary Gregory
036 * @since 1.0
037 * @version $Id: CharSet.java 918868 2010-03-04 06:22:16Z bayard $
038 */
039 public class CharSet implements Serializable {
040
041 /**
042 * Required for serialization support. Lang version 2.0.
043 *
044 * @see java.io.Serializable
045 */
046 private static final long serialVersionUID = 5947847346149275958L;
047
048 /**
049 * A CharSet defining no characters.
050 * @since 2.0
051 */
052 public static final CharSet EMPTY = new CharSet((String) null);
053
054 /**
055 * A CharSet defining ASCII alphabetic characters "a-zA-Z".
056 * @since 2.0
057 */
058 public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z");
059
060 /**
061 * A CharSet defining ASCII alphabetic characters "a-z".
062 * @since 2.0
063 */
064 public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z");
065
066 /**
067 * A CharSet defining ASCII alphabetic characters "A-Z".
068 * @since 2.0
069 */
070 public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z");
071
072 /**
073 * A CharSet defining ASCII alphabetic characters "0-9".
074 * @since 2.0
075 */
076 public static final CharSet ASCII_NUMERIC = new CharSet("0-9");
077
078 /**
079 * A Map of the common cases used in the factory.
080 * Subclasses can add more common patterns if desired
081 * @since 2.0
082 */
083 protected static final Map<String, CharSet> COMMON = Collections.synchronizedMap(new HashMap<String, CharSet>());
084
085 static {
086 COMMON.put(null, EMPTY);
087 COMMON.put("", EMPTY);
088 COMMON.put("a-zA-Z", ASCII_ALPHA);
089 COMMON.put("A-Za-z", ASCII_ALPHA);
090 COMMON.put("a-z", ASCII_ALPHA_LOWER);
091 COMMON.put("A-Z", ASCII_ALPHA_UPPER);
092 COMMON.put("0-9", ASCII_NUMERIC);
093 }
094
095 /** The set of CharRange objects. */
096 private final Set<CharRange> set = new HashSet<CharRange>();
097
098 //-----------------------------------------------------------------------
099 /**
100 * <p>Factory method to create a new CharSet using a special syntax.</p>
101 *
102 * <ul>
103 * <li><code>null</code> or empty string ("")
104 * - set containing no characters</li>
105 * <li>Single character, such as "a"
106 * - set containing just that character</li>
107 * <li>Multi character, such as "a-e"
108 * - set containing characters from one character to the other</li>
109 * <li>Negated, such as "^a" or "^a-e"
110 * - set containing all characters except those defined</li>
111 * <li>Combinations, such as "abe-g"
112 * - set containing all the characters from the individual sets</li>
113 * </ul>
114 *
115 * <p>The matching order is:</p>
116 * <ol>
117 * <li>Negated multi character range, such as "^a-e"
118 * <li>Ordinary multi character range, such as "a-e"
119 * <li>Negated single character, such as "^a"
120 * <li>Ordinary single character, such as "a"
121 * </ol>
122 * <p>Matching works left to right. Once a match is found the
123 * search starts again from the next character.</p>
124 *
125 * <p>If the same range is defined twice using the same syntax, only
126 * one range will be kept.
127 * Thus, "a-ca-c" creates only one range of "a-c".</p>
128 *
129 * <p>If the start and end of a range are in the wrong order,
130 * they are reversed. Thus "a-e" is the same as "e-a".
131 * As a result, "a-ee-a" would create only one range,
132 * as the "a-e" and "e-a" are the same.</p>
133 *
134 * <p>The set of characters represented is the union of the specified ranges.</p>
135 *
136 * <p>All CharSet objects returned by this method will be immutable.</p>
137 *
138 * @param setStr the String describing the set, may be null
139 * @return a CharSet instance
140 * @since 2.0
141 */
142 public static CharSet getInstance(String setStr) {
143 Object set = COMMON.get(setStr);
144 if (set != null) {
145 return (CharSet) set;
146 }
147 return new CharSet(setStr);
148 }
149
150 /**
151 * <p>Constructs a new CharSet using the set syntax.
152 * Each string is merged in with the set.</p>
153 *
154 * @param setStrs Strings to merge into the initial set, may be null
155 * @return a CharSet instance
156 * @since 2.4
157 */
158 public static CharSet getInstance(String[] setStrs) {
159 if (setStrs == null) {
160 return null;
161 }
162 return new CharSet(setStrs);
163 }
164
165 //-----------------------------------------------------------------------
166 /**
167 * <p>Constructs a new CharSet using the set syntax.</p>
168 *
169 * @param setStr the String describing the set, may be null
170 * @since 2.0
171 */
172 protected CharSet(String setStr) {
173 super();
174 add(setStr);
175 }
176
177 /**
178 * <p>Constructs a new CharSet using the set syntax.
179 * Each string is merged in with the set.</p>
180 *
181 * @param set Strings to merge into the initial set
182 * @throws NullPointerException if set is <code>null</code>
183 */
184 protected CharSet(String[] set) {
185 super();
186 int sz = set.length;
187 for (int i = 0; i < sz; i++) {
188 add(set[i]);
189 }
190 }
191
192 //-----------------------------------------------------------------------
193 /**
194 * <p>Add a set definition string to the <code>CharSet</code>.</p>
195 *
196 * @param str set definition string
197 */
198 protected void add(String str) {
199 if (str == null) {
200 return;
201 }
202
203 int len = str.length();
204 int pos = 0;
205 while (pos < len) {
206 int remainder = (len - pos);
207 if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') {
208 // negated range
209 set.add(CharRange.isNotIn(str.charAt(pos + 1), str.charAt(pos + 3)));
210 pos += 4;
211 } else if (remainder >= 3 && str.charAt(pos + 1) == '-') {
212 // range
213 set.add(CharRange.isIn(str.charAt(pos), str.charAt(pos + 2)));
214 pos += 3;
215 } else if (remainder >= 2 && str.charAt(pos) == '^') {
216 // negated char
217 set.add(CharRange.isNot(str.charAt(pos + 1)));
218 pos += 2;
219 } else {
220 // char
221 set.add(CharRange.is(str.charAt(pos)));
222 pos += 1;
223 }
224 }
225 }
226
227 //-----------------------------------------------------------------------
228 /**
229 * <p>Gets the internal set as an array of CharRange objects.</p>
230 *
231 * @return an array of immutable CharRange objects
232 * @since 2.0
233 */
234 public CharRange[] getCharRanges() {
235 return set.toArray(new CharRange[set.size()]);
236 }
237
238 //-----------------------------------------------------------------------
239 /**
240 * <p>Does the <code>CharSet</code> contain the specified
241 * character <code>ch</code>.</p>
242 *
243 * @param ch the character to check for
244 * @return <code>true</code> if the set contains the characters
245 */
246 public boolean contains(char ch) {
247 for (CharRange range : set) {
248 if (range.contains(ch)) {
249 return true;
250 }
251 }
252 return false;
253 }
254
255 // Basics
256 //-----------------------------------------------------------------------
257 /**
258 * <p>Compares two CharSet objects, returning true if they represent
259 * exactly the same set of characters defined in the same way.</p>
260 *
261 * <p>The two sets <code>abc</code> and <code>a-c</code> are <i>not</i>
262 * equal according to this method.</p>
263 *
264 * @param obj the object to compare to
265 * @return true if equal
266 * @since 2.0
267 */
268 @Override
269 public boolean equals(Object obj) {
270 if (obj == this) {
271 return true;
272 }
273 if (obj instanceof CharSet == false) {
274 return false;
275 }
276 CharSet other = (CharSet) obj;
277 return set.equals(other.set);
278 }
279
280 /**
281 * <p>Gets a hashCode compatible with the equals method.</p>
282 *
283 * @return a suitable hashCode
284 * @since 2.0
285 */
286 @Override
287 public int hashCode() {
288 return 89 + set.hashCode();
289 }
290
291 /**
292 * <p>Gets a string representation of the set.</p>
293 *
294 * @return string representation of the set
295 */
296 @Override
297 public String toString() {
298 return set.toString();
299 }
300
301 }