| /* |
| * Copyright (c) 2012, 2021 Oracle and/or its affiliates. All rights reserved. |
| * |
| * This program and the accompanying materials are made available under the |
| * terms of the Eclipse Public License v. 2.0 which is available at |
| * http://www.eclipse.org/legal/epl-2.0, |
| * or the Eclipse Distribution License v. 1.0 which is available at |
| * http://www.eclipse.org/org/documents/edl-v10.php. |
| * |
| * SPDX-License-Identifier: EPL-2.0 OR BSD-3-Clause |
| */ |
| |
| // Contributors: |
| // Oracle - initial API and implementation |
| // |
| package org.eclipse.persistence.jpa.jpql.tools.utility; |
| |
| import java.util.HashMap; |
| import java.util.Map; |
| |
/**
 * This converter handles references when dealing with text or markup in an XML document. Those
 * references (escape characters) are defined in ISO-8859-1 Reference.
 * <p>
 * The conversion supports both converting a numeric character reference (&amp;#nnnn; where nnnn is
 * the code point in decimal form or &amp;#xhhhh; where hhhh is the code point in hexadecimal form)
 * and a character entity reference (&amp;name; where name is the case-sensitive name of the entity).
 * <p>
 * All methods are static; the only shared state is the entity dictionary, which is populated once
 * when the class is initialized and is never modified afterwards.
 *
 * @version 2.5
 * @since 2.5
 * @author Pascal Filion
 */
@SuppressWarnings("nls")
public final class XmlEscapeCharacterConverter {

    /**
     * The entity name for ampersand: <b>&amp;amp;</b>.
     */
    public static final String AMPERSAND_ENTITY_NAME = "&amp;";

    /**
     * The entity name for apostrophe: <b>&amp;apos;</b>.
     */
    public static final String APOSTROPHE_ENTITY_NAME = "&apos;";

    /**
     * The map of entity names (without the leading ampersand and the trailing semi-colon) mapped
     * to the corresponding Unicode character.
     */
    private static final Map<String, String> dictionary = buildDictionary();

    /**
     * The entity name for greater-than symbol: <b>&amp;gt;</b>.
     */
    public static final String GREATER_THAN_ENTITY_NAME = "&gt;";

    /**
     * The entity name for less-than symbol: <b>&amp;lt;</b>.
     */
    public static final String LESS_THAN_ENTITY_NAME = "&lt;";

    /**
     * The entity name for quotation mark: <b>&amp;quot;</b>.
     */
    public static final String QUOTATION_MARK_NAME = "&quot;";

    /**
     * Cannot instantiate <code>XmlEscapeCharacterConverter</code>.
     */
    private XmlEscapeCharacterConverter() {
        super();
    }

    /**
     * Creates the map of the supported character entity references.
     *
     * @return The entity names (case-sensitive) mapped to the Unicode character they represent
     */
    private static Map<String, String> buildDictionary() {

        Map<String, String> entities = new HashMap<>();

        // Reserved characters
        entities.put("quot",     "\"");     // Quotation Mark
        entities.put("apos",     "'");      // Apostrophe
        entities.put("amp",      "&");      // Ampersand
        entities.put("lt",       "<");      // Less Than Symbol
        entities.put("gt",       ">");      // Greater Than Symbol

        // ISO-8859-1 symbols
        entities.put("nbsp",     "\u00A0"); // Nonbreaking space
        entities.put("iexcl",    "\u00A1"); // Inverted Exclamation Point
        entities.put("cent",     "\u00A2"); // Cent Sign
        entities.put("pound",    "\u00A3"); // Pound Sterling
        entities.put("curren",   "\u00A4"); // General Currency Sign
        entities.put("yen",      "\u00A5"); // Yen Sign
        entities.put("brvbar",   "\u00A6"); // Broken Vertical Bar
        entities.put("sect",     "\u00A7"); // Section Sign
        entities.put("uml",      "\u00A8"); // Umlaut
        entities.put("copy",     "\u00A9"); // Copyright
        entities.put("ordf",     "\u00AA"); // Feminine Ordinal
        entities.put("laquo",    "\u00AB"); // Left Angle Quote
        entities.put("not",      "\u00AC"); // Not Sign
        entities.put("shy",      "\u00AD"); // Soft Hyphen
        entities.put("reg",      "\u00AE"); // Registered Trademark
        entities.put("macr",     "\u00AF"); // Macron Accent
        entities.put("deg",      "\u00B0"); // Degree Sign
        entities.put("plusmn",   "\u00B1"); // Plus or Minus
        entities.put("sup2",     "\u00B2"); // Superscript Two
        entities.put("sup3",     "\u00B3"); // Superscript Three
        entities.put("acute",    "\u00B4"); // Acute Accent
        entities.put("micro",    "\u00B5"); // Micro Sign
        entities.put("para",     "\u00B6"); // Paragraph Sign
        entities.put("middot",   "\u00B7"); // Middle Dot
        entities.put("cedil",    "\u00B8"); // Cedilla
        entities.put("sup1",     "\u00B9"); // Superscript One
        entities.put("ordm",     "\u00BA"); // Masculine Ordinal
        entities.put("raquo",    "\u00BB"); // Right Angle Quote
        entities.put("frac14",   "\u00BC"); // Fraction One-Fourth
        entities.put("frac12",   "\u00BD"); // Fraction One-Half
        entities.put("frac34",   "\u00BE"); // Fraction Three-Fourths
        entities.put("iquest",   "\u00BF"); // Inverted Question Mark
        entities.put("times",    "\u00D7"); // Multiplication
        entities.put("divide",   "\u00F7"); // Division

        // ISO-8859-1 characters
        entities.put("Agrave",   "\u00C0"); // Latin capital letter A with grave accent
        entities.put("Aacute",   "\u00C1"); // Latin capital letter A with acute accent
        entities.put("Acirc",    "\u00C2"); // Latin capital letter A with circumflex
        entities.put("Atilde",   "\u00C3"); // Latin capital letter A with tilde
        entities.put("Auml",     "\u00C4"); // Latin capital letter A with diaeresis
        entities.put("Aring",    "\u00C5"); // Latin capital letter A with ring above
        entities.put("AElig",    "\u00C6"); // Latin capital letter AE
        entities.put("Ccedil",   "\u00C7"); // Latin capital letter C with cedilla
        entities.put("Egrave",   "\u00C8"); // Latin capital letter E with grave accent
        entities.put("Eacute",   "\u00C9"); // Latin capital letter E with acute accent
        entities.put("Ecirc",    "\u00CA"); // Latin capital letter E with circumflex
        entities.put("Euml",     "\u00CB"); // Latin capital letter E with diaeresis
        entities.put("Igrave",   "\u00CC"); // Latin capital letter I with grave accent
        entities.put("Iacute",   "\u00CD"); // Latin capital letter I with acute accent
        entities.put("Icirc",    "\u00CE"); // Latin capital letter I with circumflex
        entities.put("Iuml",     "\u00CF"); // Latin capital letter I with diaeresis
        entities.put("ETH",      "\u00D0"); // Latin capital letter Eth
        entities.put("Ntilde",   "\u00D1"); // Latin capital letter N with tilde
        entities.put("Ograve",   "\u00D2"); // Latin capital letter O with grave accent
        entities.put("Oacute",   "\u00D3"); // Latin capital letter O with acute accent
        entities.put("Ocirc",    "\u00D4"); // Latin capital letter O with circumflex
        entities.put("Otilde",   "\u00D5"); // Latin capital letter O with tilde
        entities.put("Ouml",     "\u00D6"); // Latin capital letter O with diaeresis
        entities.put("Oslash",   "\u00D8"); // Latin capital letter O with stroke
        entities.put("Ugrave",   "\u00D9"); // Latin capital letter U with grave accent
        entities.put("Uacute",   "\u00DA"); // Latin capital letter U with acute accent
        entities.put("Ucirc",    "\u00DB"); // Latin capital letter U with circumflex
        entities.put("Uuml",     "\u00DC"); // Latin capital letter U with diaeresis
        entities.put("Yacute",   "\u00DD"); // Latin capital letter Y with acute accent
        entities.put("THORN",    "\u00DE"); // Latin capital letter THORN
        entities.put("szlig",    "\u00DF"); // Latin small letter sharp s
        entities.put("agrave",   "\u00E0"); // Latin small letter a with grave accent
        entities.put("aacute",   "\u00E1"); // Latin small letter a with acute accent
        entities.put("acirc",    "\u00E2"); // Latin small letter a with circumflex
        entities.put("atilde",   "\u00E3"); // Latin small letter a with tilde
        entities.put("auml",     "\u00E4"); // Latin small letter a with diaeresis
        entities.put("aring",    "\u00E5"); // Latin small letter a with ring above
        entities.put("aelig",    "\u00E6"); // Latin small letter ae
        entities.put("ccedil",   "\u00E7"); // Latin small letter c with cedilla
        entities.put("egrave",   "\u00E8"); // Latin small letter e with grave accent
        entities.put("eacute",   "\u00E9"); // Latin small letter e with acute accent
        entities.put("ecirc",    "\u00EA"); // Latin small letter e with circumflex
        entities.put("euml",     "\u00EB"); // Latin small letter e with diaeresis
        entities.put("igrave",   "\u00EC"); // Latin small letter i with grave accent
        entities.put("iacute",   "\u00ED"); // Latin small letter i with acute accent
        entities.put("icirc",    "\u00EE"); // Latin small letter i with circumflex
        entities.put("iuml",     "\u00EF"); // Latin small letter i with diaeresis
        entities.put("eth",      "\u00F0"); // Latin small letter eth
        entities.put("ntilde",   "\u00F1"); // Latin small letter n with tilde
        entities.put("ograve",   "\u00F2"); // Latin small letter o with grave accent
        entities.put("oacute",   "\u00F3"); // Latin small letter o with acute accent
        entities.put("ocirc",    "\u00F4"); // Latin small letter o with circumflex
        entities.put("otilde",   "\u00F5"); // Latin small letter o with tilde
        entities.put("ouml",     "\u00F6"); // Latin small letter o with diaeresis
        entities.put("oslash",   "\u00F8"); // Latin small letter o with stroke
        entities.put("ugrave",   "\u00F9"); // Latin small letter u with grave accent
        entities.put("uacute",   "\u00FA"); // Latin small letter u with acute accent
        entities.put("ucirc",    "\u00FB"); // Latin small letter u with circumflex
        entities.put("uuml",     "\u00FC"); // Latin small letter u with diaeresis
        entities.put("yacute",   "\u00FD"); // Latin small letter y with acute accent
        entities.put("thorn",    "\u00FE"); // Latin small letter thorn
        entities.put("yuml",     "\u00FF"); // Latin small letter y with diaeresis

        // Math Symbols
        entities.put("forall",   "\u2200"); // For all
        entities.put("part",     "\u2202"); // Partial differential
        entities.put("exist",    "\u2203"); // There exists
        entities.put("empty",    "\u2205"); // Empty set; Null Set; Diameter
        entities.put("nabla",    "\u2207"); // Nabla; Backward difference
        entities.put("isin",     "\u2208"); // Element of
        entities.put("notin",    "\u2209"); // Not an element of
        entities.put("ni",       "\u220B"); // Contains as member
        entities.put("prod",     "\u220F"); // N-ary product; Product sign
        entities.put("sum",      "\u2211"); // N-ary summation
        entities.put("minus",    "\u2212"); // Minus sign
        entities.put("lowast",   "\u2217"); // Asterisk operator
        entities.put("radic",    "\u221A"); // Square root; Radical sign
        entities.put("prop",     "\u221D"); // Proportional to
        entities.put("infin",    "\u221E"); // Infinity
        entities.put("ang",      "\u2220"); // Angle
        entities.put("and",      "\u2227"); // Logical and; Wedge
        entities.put("or",       "\u2228"); // Logical or; Vee
        entities.put("cap",      "\u2229"); // Intersection; Cap
        entities.put("cup",      "\u222A"); // Union; Cup
        entities.put("int",      "\u222B"); // Integral
        entities.put("there4",   "\u2234"); // Therefore
        entities.put("sim",      "\u223C"); // Tilde operator; Varies with; Similar to
        entities.put("cong",     "\u2245"); // Approximately equal to
        entities.put("asymp",    "\u2248"); // Almost equal to; Asymptotic to
        entities.put("ne",       "\u2260"); // Not equal to
        entities.put("equiv",    "\u2261"); // Identical to
        entities.put("le",       "\u2264"); // Less-than or equal to
        entities.put("ge",       "\u2265"); // Greater-than or equal to
        entities.put("sub",      "\u2282"); // Subset of
        entities.put("sup",      "\u2283"); // Superset of
        entities.put("nsub",     "\u2284"); // Not a subset of
        entities.put("sube",     "\u2286"); // Subset of or equal to
        entities.put("supe",     "\u2287"); // Superset of or equal to
        entities.put("oplus",    "\u2295"); // Circled plus; Direct sum
        entities.put("otimes",   "\u2297"); // Circled times; Vector product
        entities.put("perp",     "\u22A5"); // Up tack; Orthogonal to; Perpendicular
        entities.put("sdot",     "\u22C5"); // Dot operator

        // Arrows
        entities.put("larr",     "\u2190"); // Leftwards arrow
        entities.put("uarr",     "\u2191"); // Upwards arrow
        entities.put("rarr",     "\u2192"); // Rightwards arrow
        entities.put("darr",     "\u2193"); // Downwards arrow
        entities.put("harr",     "\u2194"); // Left right arrow
        entities.put("crarr",    "\u21B5"); // Downwards arrow with corner leftwards; Carriage return symbol
        entities.put("lArr",     "\u21D0"); // Leftwards double arrow
        entities.put("uArr",     "\u21D1"); // Upwards double arrow
        entities.put("rArr",     "\u21D2"); // Rightwards double arrow
        entities.put("dArr",     "\u21D3"); // Downwards double arrow
        entities.put("hArr",     "\u21D4"); // Left right double arrow

        // Greek Capital Letters
        entities.put("Alpha",    "\u0391"); // Greek capital letter alpha
        entities.put("Beta",     "\u0392"); // Greek capital letter beta
        entities.put("Gamma",    "\u0393"); // Greek capital letter gamma
        entities.put("Delta",    "\u0394"); // Greek capital letter delta
        entities.put("Epsilon",  "\u0395"); // Greek capital letter epsilon
        entities.put("Zeta",     "\u0396"); // Greek capital letter zeta
        entities.put("Eta",      "\u0397"); // Greek capital letter eta
        entities.put("Theta",    "\u0398"); // Greek capital letter theta
        entities.put("Iota",     "\u0399"); // Greek capital letter iota
        entities.put("Kappa",    "\u039A"); // Greek capital letter kappa
        entities.put("Lambda",   "\u039B"); // Greek capital letter lambda
        entities.put("Mu",       "\u039C"); // Greek capital letter mu
        entities.put("Nu",       "\u039D"); // Greek capital letter nu
        entities.put("Xi",       "\u039E"); // Greek capital letter xi
        entities.put("Omicron",  "\u039F"); // Greek capital letter omicron
        entities.put("Pi",       "\u03A0"); // Greek capital letter pi
        entities.put("Rho",      "\u03A1"); // Greek capital letter rho
        entities.put("Sigma",    "\u03A3"); // Greek capital letter sigma
        entities.put("Tau",      "\u03A4"); // Greek capital letter tau
        entities.put("Upsilon",  "\u03A5"); // Greek capital letter upsilon
        entities.put("Phi",      "\u03A6"); // Greek capital letter phi
        entities.put("Chi",      "\u03A7"); // Greek capital letter chi
        entities.put("Psi",      "\u03A8"); // Greek capital letter psi
        entities.put("Omega",    "\u03A9"); // Greek capital letter omega

        // Greek Small Letters
        entities.put("alpha",    "\u03B1"); // Greek small letter alpha
        entities.put("beta",     "\u03B2"); // Greek small letter beta
        entities.put("gamma",    "\u03B3"); // Greek small letter gamma
        entities.put("delta",    "\u03B4"); // Greek small letter delta
        entities.put("epsilon",  "\u03B5"); // Greek small letter epsilon
        entities.put("zeta",     "\u03B6"); // Greek small letter zeta
        entities.put("eta",      "\u03B7"); // Greek small letter eta
        entities.put("theta",    "\u03B8"); // Greek small letter theta
        entities.put("iota",     "\u03B9"); // Greek small letter iota
        entities.put("kappa",    "\u03BA"); // Greek small letter kappa
        entities.put("lambda",   "\u03BB"); // Greek small letter lambda
        entities.put("mu",       "\u03BC"); // Greek small letter mu
        entities.put("nu",       "\u03BD"); // Greek small letter nu
        entities.put("xi",       "\u03BE"); // Greek small letter xi
        entities.put("omicron",  "\u03BF"); // Greek small letter omicron
        entities.put("pi",       "\u03C0"); // Greek small letter pi
        entities.put("rho",      "\u03C1"); // Greek small letter rho
        entities.put("sigmaf",   "\u03C2"); // Greek small letter final sigma
        entities.put("sigma",    "\u03C3"); // Greek small letter sigma
        entities.put("tau",      "\u03C4"); // Greek small letter tau
        entities.put("upsilon",  "\u03C5"); // Greek small letter upsilon
        entities.put("phi",      "\u03C6"); // Greek small letter phi
        entities.put("chi",      "\u03C7"); // Greek small letter chi
        entities.put("psi",      "\u03C8"); // Greek small letter psi
        entities.put("omega",    "\u03C9"); // Greek small letter omega
        // The theta symbol has its own entity name; registering it under "theta" would
        // silently replace the mapping of the regular small letter theta added above
        entities.put("thetasym", "\u03D1"); // Greek small letter theta symbol
        entities.put("upsih",    "\u03D2"); // Greek upsilon with hook symbol
        entities.put("piv",      "\u03D6"); // Greek pi symbol

        // Latin Extended-A and Letterlike Symbols
        entities.put("OElig",    "\u0152"); // Latin capital ligature oe
        entities.put("oelig",    "\u0153"); // Latin small ligature oe
        entities.put("Scaron",   "\u0160"); // Latin capital letter s with caron
        entities.put("scaron",   "\u0161"); // Latin small letter s with caron
        entities.put("Yuml",     "\u0178"); // Latin capital letter y with diaeresis
        entities.put("fnof",     "\u0192"); // Latin small f with hook
        entities.put("weierp",   "\u2118"); // Script capital P; Power set; Weierstrass p
        entities.put("image",    "\u2111"); // Blackletter capital I; Imaginary part
        entities.put("real",     "\u211C"); // Blackletter capital R; Real part symbol
        entities.put("trade",    "\u2122"); // Trade mark sign
        entities.put("alefsym",  "\u2135"); // Alef symbol; First transfinite cardinal

        // Miscellaneous Shapes
        entities.put("spades",   "\u2660"); // Black spade suit
        entities.put("clubs",    "\u2663"); // Black club suit; Shamrock
        entities.put("hearts",   "\u2665"); // Black heart suit; Valentine
        entities.put("diams",    "\u2666"); // Black diamond suit
        entities.put("loz",      "\u25CA"); // Lozenge

        // Miscellaneous Technical Symbols
        entities.put("lceil",    "\u2308"); // Left ceiling; Apl upstile
        entities.put("rceil",    "\u2309"); // Right ceiling
        entities.put("lfloor",   "\u230A"); // Left floor; Apl downstile
        entities.put("rfloor",   "\u230B"); // Right floor
        entities.put("lang",     "\u2329"); // Left-pointing angle bracket
        entities.put("rang",     "\u232A"); // Right-pointing angle bracket

        // Spacing Modifier Characters and Bi-directional Characters
        entities.put("circ",     "\u02C6"); // Modifier letter circumflex accent
        entities.put("tilde",    "\u02DC"); // Small tilde
        entities.put("zwnj",     "\u200C"); // Zero width non-joiner
        entities.put("zwj",      "\u200D"); // Zero width joiner
        entities.put("lrm",      "\u200E"); // Left-to-right mark
        entities.put("rlm",      "\u200F"); // Right-to-left mark

        // General Punctuation Set 1
        entities.put("bull",     "\u2022"); // Bullet; Black small circle
        entities.put("hellip",   "\u2026"); // Horizontal ellipsis; Three dot leader
        entities.put("prime",    "\u2032"); // Prime; Minutes; Feet
        entities.put("Prime",    "\u2033"); // Double prime; Seconds; Inches
        entities.put("oline",    "\u203E"); // Overline; Spacing overscore
        entities.put("frasl",    "\u2044"); // Fraction slash

        // General Punctuation Set 2
        entities.put("ensp",     "\u2002"); // En space
        entities.put("emsp",     "\u2003"); // Em space
        entities.put("thinsp",   "\u2009"); // Thin space
        entities.put("ndash",    "\u2013"); // En dash
        entities.put("mdash",    "\u2014"); // Em dash
        entities.put("lsquo",    "\u2018"); // Left single quotation mark
        entities.put("rsquo",    "\u2019"); // Right single quotation mark
        entities.put("sbquo",    "\u201A"); // Single low-9 quotation mark
        entities.put("ldquo",    "\u201C"); // Left double quotation mark
        entities.put("rdquo",    "\u201D"); // Right double quotation mark
        entities.put("bdquo",    "\u201E"); // Double low-9 quotation mark
        entities.put("dagger",   "\u2020"); // Dagger
        entities.put("Dagger",   "\u2021"); // Double dagger
        entities.put("permil",   "\u2030"); // Per mille sign
        entities.put("lsaquo",   "\u2039"); // Single left-pointing angle quotation mark
        entities.put("rsaquo",   "\u203A"); // Single right-pointing angle quotation mark
        entities.put("euro",     "\u20AC"); // Euro

        return entities;
    }

    /**
     * Converts the characters that are reserved in an XML document the given string may have into
     * their corresponding references (escape characters) using the character entity reference.
     *
     * @param value A string that may contain characters that need to be escaped
     * @param positions This array of length one or two can be used to adjust the position of the
     * cursor or a text range within the string during the conversion of the reserved characters.
     * The positions are expressed in the given (non-escaped) string and are updated in place so
     * they point to the same location within the returned string. <code>null</code> or an empty
     * array means there is no position to adjust
     * @return The given string with any reserved characters converted into the escape characters;
     * the given string itself is returned if it is <code>null</code> or empty
     */
    public static String escape(String value, int[] positions) {

        if ((value == null) || value.isEmpty()) {
            return value;
        }

        // Snapshot the original positions: the comparisons below are done against indices of the
        // non-escaped string while the array itself is progressively shifted
        int startPosition = ((positions != null) && (positions.length > 0)) ? positions[0] : -1;
        int endPosition   = ((positions != null) && (positions.length > 1)) ? positions[1] : -1;

        StringBuilder sb = new StringBuilder(value.length());

        for (int index = 0, count = value.length(); index < count; index++) {

            char character = value.charAt(index);
            String name = getEscapeCharacter(character);

            // Not a reserved character, copy it as is
            if (name == null) {
                sb.append(character);
                continue;
            }

            sb.append(name);

            // -1 for the character itself that is replaced by the entity name
            int growth = name.length() - 1;

            // Only the positions located after the reserved character are shifted;
            // an absent position was snapshot as -1 and can never be after an index
            if (index < startPosition) {
                positions[0] += growth;
            }

            if (index < endPosition) {
                positions[1] += growth;
            }
        }

        return sb.toString();
    }

    /**
     * Returns the Unicode character for the given reference (which is either a numeric character
     * reference or a character entity reference).
     * <p>
     * A numeric character reference may denote any valid Unicode code point other than zero. A
     * supplementary code point (above U+FFFF) is returned as a string made of its surrogate pair,
     * which means the returned string may contain two <code>char</code> values.
     *
     * @param reference The numeric character or character entity reference stripped of the leading
     * ampersand and trailing semi-colon
     * @return The Unicode character mapped to the given reference or <code>null</code> if the
     * reference is invalid or unknown
     */
    public static String getCharacter(String reference) {

        if ((reference == null) || reference.isEmpty()) {
            return null;
        }

        // Character entity reference
        if (reference.charAt(0) != '#') {
            return dictionary.get(reference);
        }

        // Numeric character reference: '#' alone is meaningless
        if (reference.length() == 1) {
            return null;
        }

        String digits;
        int radix;

        // Hexadecimal (XML only recognizes the lowercase 'x' marker)
        if (reference.charAt(1) == 'x') {
            radix  = 16;
            digits = reference.substring(2);
        }
        // Decimal
        else {
            radix  = 10;
            digits = reference.substring(1);
        }

        if (digits.isEmpty()) {
            return null;
        }

        // No sign accepted, Integer.parseInt() would otherwise happily parse "-65" and "+65"
        char firstDigit = digits.charAt(0);

        if ((firstDigit == '-') || (firstDigit == '+')) {
            return null;
        }

        int codePoint;

        try {
            codePoint = Integer.parseInt(digits, radix);
        }
        catch (NumberFormatException ignored) {
            // Not a number (or too large to be one), the reference is invalid
            return null;
        }

        // The null character (&#0;) is not permitted and anything above U+10FFFF is not Unicode
        if ((codePoint == 0) || !Character.isValidCodePoint(codePoint)) {
            return null;
        }

        // Character.toChars() yields a surrogate pair for supplementary code points
        // instead of truncating the value to 16 bits like a cast to char would do
        return new String(Character.toChars(codePoint));
    }

    /**
     * Returns the escaped character for the given reserved character.
     *
     * @param character The reserved character to retrieve its escape character with the entity name
     * @return The escape character with the entity name of the given character if it is a reserved
     * character; otherwise returns <code>null</code>
     */
    public static String getEscapeCharacter(char character) {

        switch (character) {
            case '<':  return LESS_THAN_ENTITY_NAME;
            case '>':  return GREATER_THAN_ENTITY_NAME;
            case '&':  return AMPERSAND_ENTITY_NAME;
            case '\'': return APOSTROPHE_ENTITY_NAME;
            case '\"': return QUOTATION_MARK_NAME;
            default:   return null;
        }
    }

    /**
     * Determines if the given character is one of the XML/HTML reserved characters.
     *
     * @param character The character to verify if it's one of the reserved characters
     * @return <code>true</code> if the given character is defined as a reserved characters;
     * <code>false</code> otherwise
     */
    public static boolean isReserved(char character) {
        // A character is reserved exactly when it has an entity name to be escaped with
        return getEscapeCharacter(character) != null;
    }

    /**
     * Re-adjusts the given positions, which is based on the non-escaped version of the given
     * <em>query</em>, by making sure it is pointing at the same position within <em>query</em>,
     * which contains references (escape characters).
     * <p>
     * The escape characters are either the character entity references or the numeric character
     * references used in an XML document.
     * <p>
     * <b>Important:</b> The given query should contain the exact same amount of whitespace than the
     * query used to calculate the given positions.
     *
     * @param query The query that may contain escape characters
     * @param positions The position within the non-escaped version of the given query, which is
     * either a single element position or two positions that is used as a text range. After execution
     * contains the adjusted positions by moving it based on the difference between the escape and
     * non-escaped versions of the query. Nothing is done if it is <code>null</code> or empty
     * @since 2.5
     */
    public static void reposition(CharSequence query, int[] positions) {

        if ((query == null) || (query.length() == 0)) {
            return;
        }

        if ((positions == null) || (positions.length == 0)) {
            return;
        }

        String text = query.toString();
        boolean hasEndPosition = (positions.length > 1);

        for (int index = 0, count = text.length(); index < count; index++) {

            // Only an ampersand can begin an escape character
            if (text.charAt(index) != '&') {
                continue;
            }

            // Find the ending of the escape character
            int semiColonIndex = text.indexOf(';', index + 1);

            // No semi-colon from here means none will be found for any later ampersand either
            if (semiColonIndex < 0) {
                break;
            }

            // Retrieve the character mapped to the reference (null if empty, invalid or unknown)
            String unicodeCharacter = getCharacter(text.substring(index + 1, semiColonIndex));

            if (unicodeCharacter == null) {
                continue;
            }

            // '&' + 'reference' + ';' - 'Unicode character'
            int growth = (semiColonIndex + 1 - index) - unicodeCharacter.length();

            // Translate both positions because the escape character is located before them
            if (index < positions[0]) {
                positions[0] += growth;

                if (hasEndPosition) {
                    positions[1] += growth;
                }
            }
            // Only translate the end position because the start
            // position is before the current index
            else if (hasEndPosition && (index < positions[1])) {
                positions[1] += growth;
            }

            // Continue right after the escape character
            index = semiColonIndex;
        }
    }

    /**
     * Converts the references (escape characters) the given string may have into their corresponding
     * Unicode characters.
     *
     * <ul>
     * <li>Character entity reference: <b>&amp;copy;</b> for <b>&copy;</b></li>
     * <li>Numeric character reference (decimal value): <b>&amp;#169;</b> for <b>&copy;</b></li>
     * <li>Numeric character reference (hexadecimal value): <b>&amp;#xA9;</b> for <b>&copy;</b></li>
     * </ul>
     *
     * A reference that is invalid or unknown is left untouched. The character resulting from a
     * conversion is never scanned again, which means <b>&amp;amp;lt;</b> is converted into
     * <b>&amp;lt;</b> and not into the less-than symbol.
     *
     * @param value A string that may contain escape characters
     * @param position This array of length one can be used to adjust the position of the cursor
     * within the string during the conversion of the escape characters. A cursor located inside
     * an escape character is moved to the beginning of the converted character. <code>null</code>
     * or an empty array means there is no position to adjust
     * @return The given string with any escape characters converted into the actual Unicode
     * characters; the given string itself is returned if it is <code>null</code> or empty
     */
    public static String unescape(String value, int[] position) {

        if ((value == null) || value.isEmpty()) {
            return value;
        }

        boolean trackCursor = (position != null) && (position.length > 0);
        StringBuilder sb = new StringBuilder(value);

        for (int index = 0, count = sb.length(); index < count; index++) {

            // Only an ampersand can begin an escape character
            if (sb.charAt(index) != '&') {
                continue;
            }

            // Find the ending of the escape character
            int semiColonIndex = sb.indexOf(";", index + 1);

            // No semi-colon from here means none will be found for any later ampersand either
            if (semiColonIndex < 0) {
                break;
            }

            // Retrieve the character mapped to the reference (null if empty, invalid or unknown)
            String specialCharacter = getCharacter(sb.substring(index + 1, semiColonIndex));

            if (specialCharacter == null) {
                continue;
            }

            // Replace the reference by the Unicode character
            sb.replace(index, semiColonIndex + 1, specialCharacter);

            // '&' + 'reference' + ';' - 'Unicode character'
            int removed = (semiColonIndex + 1 - index) - specialCharacter.length();

            // Make sure the count is updated
            count -= removed;

            if (trackCursor) {
                // Case 1: the cursor is after the escape character, just do an adjustment as
                // if the escape character had always been the Unicode character
                if (position[0] > semiColonIndex) {
                    position[0] -= removed;
                }
                // Case 2: The cursor is within the escape character, move it to the beginning
                else if (position[0] >= index) {
                    position[0] = index;
                }
            }

            // Skip the low surrogate when the reference denotes a supplementary character
            index += specialCharacter.length() - 1;
        }

        return sb.toString();
    }
}