001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.lang3.text.translate;
018
019 import java.io.IOException;
020 import java.io.Writer;
021
022 import java.util.EnumSet;
023 import java.util.Arrays;
024
025 /**
026 * Translates escaped unicode values of the form \\u+\d\d\d\d back to
027 * unicode.
028 *
029 * @author Apache Software Foundation
030 * @since 3.0
031 * @version $Id: UnicodeUnescaper.java 967237 2010-07-23 20:08:57Z mbenson $
032 */
033 public class UnicodeUnescaper extends CharSequenceTranslator {
034
035 public static enum OPTION { escapePlus }
036
037 // TODO: Create an OptionsSet class to hide some of the conditional logic below
038 private final EnumSet<OPTION> options;
039
040 public UnicodeUnescaper(OPTION... options) {
041 if(options.length > 0) {
042 this.options = EnumSet.copyOf(Arrays.asList(options));
043 } else {
044 this.options = null;
045 }
046 }
047
048 public boolean isSet(OPTION opt) {
049 return (options == null) ? false : options.contains(opt);
050 }
051
052 /**
053 * {@inheritDoc}
054 */
055 @Override
056 public int translate(CharSequence input, int index, Writer out) throws IOException {
057 if(input.charAt(index) == '\\') {
058 if( (index + 1 < input.length()) && input.charAt(index + 1) == 'u') {
059 // consume optional additional 'u' chars
060 int i=2;
061 while( (index + i < input.length()) && input.charAt(index + i) == 'u') {
062 i++;
063 }
064
065 // consume + symbol in \\u+0045
066 if(isSet(OPTION.escapePlus)) {
067 if( (index + i < input.length()) && (input.charAt(index + i) == '+') ) {
068 i++;
069 }
070 }
071
072 if( (index + i + 4 <= input.length()) ) {
073 // Get 4 hex digits
074 CharSequence unicode = input.subSequence(index + i, index + i + 4);
075
076 try {
077 int value = Integer.parseInt(unicode.toString(), 16);
078 out.write((char) value);
079 } catch (NumberFormatException nfe) {
080 throw new IllegalArgumentException("Unable to parse unicode value: " + unicode, nfe);
081 }
082 return i + 4;
083 } else {
084 throw new IllegalArgumentException("Less than 4 hex digits in unicode value: '" +
085 input.subSequence(index, input.length()) +
086 "' due to end of CharSequence");
087 }
088 }
089 }
090 return 0;
091 }
092 }