Unescape HTML special characters from a String

From CodeCodex

Conversion of HTML character entity references to the literal characters.

Contents

Implementations

C++

The following UnquoteHTML routine will, for example, translate "&amp;", "&#38;" and "&#x26;" all into "&".

/* One row of the static entity table: the entity name as it appears
  between '&' and ';', and the code point it stands for. */
typedef struct
  {
    const char * Name; /* entity name, or NULL in the terminating row */
    unsigned int Value; /* character code of the entity */
  } EntityNameEntry;
static const EntityNameEntry StaticEntityNames[] =
  /* list of entity names defined in HTML 4.0 spec. The rows are in no
    particular lookup order; the table is terminated by a row whose
    Name is NULL. */
  {
    {"nbsp", 160},
    {"iexcl", 161},
    {"cent", 162},
    {"pound", 163},
    {"curren", 164},
    {"yen", 165},
    {"brvbar", 166},
    {"sect", 167},
    {"uml", 168},
    {"copy", 169},
    {"ordf", 170},
    {"laquo", 171},
    {"not", 172},
    {"shy", 173},
    {"reg", 174},
    {"macr", 175},
    {"deg", 176},
    {"plusmn", 177},
    {"sup2", 178},
    {"sup3", 179},
    {"acute", 180},
    {"micro", 181},
    {"para", 182},
    {"middot", 183},
    {"cedil", 184},
    {"sup1", 185},
    {"ordm", 186},
    {"raquo", 187},
    {"frac14", 188},
    {"frac12", 189},
    {"frac34", 190},
    {"iquest", 191},
    {"Agrave", 192},
    {"Aacute", 193},
    {"Acirc", 194},
    {"Atilde", 195},
    {"Auml", 196},
    {"Aring", 197},
    {"AElig", 198},
    {"Ccedil", 199},
    {"Egrave", 200},
    {"Eacute", 201},
    {"Ecirc", 202},
    {"Euml", 203},
    {"Igrave", 204},
    {"Iacute", 205},
    {"Icirc", 206},
    {"Iuml", 207},
    {"ETH", 208},
    {"Ntilde", 209},
    {"Ograve", 210},
    {"Oacute", 211},
    {"Ocirc", 212},
    {"Otilde", 213},
    {"Ouml", 214},
    {"times", 215},
    {"Oslash", 216},
    {"Ugrave", 217},
    {"Uacute", 218},
    {"Ucirc", 219},
    {"Uuml", 220},
    {"Yacute", 221},
    {"THORN", 222},
    {"szlig", 223},
    {"agrave", 224},
    {"aacute", 225},
    {"acirc", 226},
    {"atilde", 227},
    {"auml", 228},
    {"aring", 229},
    {"aelig", 230},
    {"ccedil", 231},
    {"egrave", 232},
    {"eacute", 233},
    {"ecirc", 234},
    {"euml", 235},
    {"igrave", 236},
    {"iacute", 237},
    {"icirc", 238},
    {"iuml", 239},
    {"eth", 240},
    {"ntilde", 241},
    {"ograve", 242},
    {"oacute", 243},
    {"ocirc", 244},
    {"otilde", 245},
    {"ouml", 246},
    {"divide", 247},
    {"oslash", 248},
    {"ugrave", 249},
    {"uacute", 250},
    {"ucirc", 251},
    {"uuml", 252},
    {"yacute", 253},
    {"thorn", 254},
    {"yuml", 255},
    {"fnof", 402},
  /* Greek */
    {"Alpha", 913},
    {"Beta", 914},
    {"Gamma", 915},
    {"Delta", 916},
    {"Epsilon", 917},
    {"Zeta", 918},
    {"Eta", 919},
    {"Theta", 920},
    {"Iota", 921},
    {"Kappa", 922},
    {"Lambda", 923},
    {"Mu", 924},
    {"Nu", 925},
    {"Xi", 926},
    {"Omicron", 927},
    {"Pi", 928},
    {"Rho", 929},
    {"Sigma", 931},
    {"Tau", 932},
    {"Upsilon", 933},
    {"Phi", 934},
    {"Chi", 935},
    {"Psi", 936},
    {"Omega", 937},
    {"alpha", 945},
    {"beta", 946},
    {"gamma", 947},
    {"delta", 948},
    {"epsilon", 949},
    {"zeta", 950},
    {"eta", 951},
    {"theta", 952},
    {"iota", 953},
    {"kappa", 954},
    {"lambda", 955},
    {"mu", 956},
    {"nu", 957},
    {"xi", 958},
    {"omicron", 959},
    {"pi", 960},
    {"rho", 961},
    {"sigmaf", 962},
    {"sigma", 963},
    {"tau", 964},
    {"upsilon", 965},
    {"phi", 966},
    {"chi", 967},
    {"psi", 968},
    {"omega", 969},
    {"thetasym", 977},
    {"upsih", 978},
    {"piv", 982},
  /* General Punctuation */
    {"bull", 8226},
    {"hellip", 8230},
    {"prime", 8242},
    {"Prime", 8243},
    {"oline", 8254},
    {"frasl", 8260},
  /* Letterlike Symbols */
    {"weierp", 8472},
    {"image", 8465},
    {"real", 8476},
    {"trade", 8482},
    {"alefsym", 8501},
  /* Arrows */
    {"larr", 8592},
    {"uarr", 8593},
    {"rarr", 8594},
    {"darr", 8595},
    {"harr", 8596},
    {"crarr", 8629},
    {"lArr", 8656},
    {"uArr", 8657},
    {"rArr", 8658},
    {"dArr", 8659},
    {"hArr", 8660},
  /* Mathematical Operators */
    {"forall", 8704},
    {"part", 8706},
    {"exist", 8707},
    {"empty", 8709},
    {"nabla", 8711},
    {"isin", 8712},
    {"notin", 8713},
    {"ni", 8715},
    {"prod", 8719},
    {"sum", 8721},
    {"minus", 8722},
    {"lowast", 8727},
    {"radic", 8730},
    {"prop", 8733},
    {"infin", 8734},
    {"ang", 8736}, /* was missing from the list although HTML 4.0 defines it */
    {"and", 8743},
    {"or", 8744},
    {"cap", 8745},
    {"cup", 8746},
    {"int", 8747},
    {"there4", 8756},
    {"sim", 8764},
    {"cong", 8773},
    {"asymp", 8776},
    {"ne", 8800},
    {"equiv", 8801},
    {"le", 8804},
    {"ge", 8805},
    {"sub", 8834},
    {"sup", 8835},
    {"nsub", 8836},
    {"sube", 8838},
    {"supe", 8839},
    {"oplus", 8853},
    {"otimes", 8855},
    {"perp", 8869},
    {"sdot", 8901},
  /* Miscellaneous Technical */
    {"lceil", 8968},
    {"rceil", 8969},
    {"lfloor", 8970},
    {"rfloor", 8971},
    {"lang", 9001},
    {"rang", 9002},
  /* Geometric Shapes */
    {"loz", 9674},
  /* Miscellaneous Symbols */
    {"spades", 9824},
    {"clubs", 9827},
    {"hearts", 9829},
    {"diams", 9830},
  /* Markup-significant characters (C0 Controls and Basic Latin) */
    {"quot", 34},
    {"amp", 38},
    {"lt", 60},
    {"gt", 62},
  /* Latin Extended-A */
    {"OElig", 338},
    {"oelig", 339},
    {"Scaron", 352},
    {"scaron", 353},
    {"Yuml", 376},
  /* Spacing Modifier Letters */
    {"circ", 710},
    {"tilde", 732},
  /* General Punctuation */
    {"ensp", 8194},
    {"emsp", 8195},
    {"thinsp", 8201},
    {"zwnj", 8204},
    {"zwj", 8205},
    {"lrm", 8206},
    {"rlm", 8207},
    {"ndash", 8211},
    {"mdash", 8212},
    {"lsquo", 8216},
    {"rsquo", 8217},
    {"sbquo", 8218},
    {"ldquo", 8220},
    {"rdquo", 8221},
    {"bdquo", 8222},
    {"dagger", 8224},
    {"Dagger", 8225},
    {"permil", 8240},
    {"lsaquo", 8249},
    {"rsaquo", 8250},
    {"euro", 8364},
    {NULL, 0} /* marks end of list */
  } /*StaticEntityNames*/;

/* Maps an entity name (the text between '&' and ';') to its character code. */
typedef std::map<std::string, unsigned int> EntityNameMap;
/* Name/code pair, used when inserting rows into an EntityNameMap. */
typedef std::pair<std::string, unsigned int> EntityNamePair;
/* The lookup table itself; starts out empty and is filled from
  StaticEntityNames the first time a named entity has to be resolved. */
static EntityNameMap EntityNames;

static void WriteUTF8
  (
    std::ostream & Out,
    unsigned int Ch
  )
  /* writes Ch in UTF-8 encoding to Out. Handles the full Unicode range
    U+0000 .. U+10FFFF using one to four bytes. Values that are not
    valid Unicode scalar values (the surrogate range U+D800 .. U+DFFF
    and anything above U+10FFFF) cannot be represented in well-formed
    UTF-8, so U+FFFD REPLACEMENT CHARACTER is written in their place. */
  {
    if (Ch > 0x10FFFF or (Ch >= 0xD800 and Ch <= 0xDFFF))
      {
        Ch = 0xFFFD;
      } /*if*/
    if (Ch >= 0x10000)
      {
      /* four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        Out.put(static_cast<char>(0xF0 | ((Ch >> 18) & 0x07)));
        Out.put(static_cast<char>(0x80 | ((Ch >> 12) & 0x3F)));
        Out.put(static_cast<char>(0x80 | ((Ch >> 6) & 0x3F)));
        Out.put(static_cast<char>(0x80 | (Ch & 0x3F)));
      }
    else if (Ch >= 0x800)
      {
      /* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
        Out.put(static_cast<char>(0xE0 | ((Ch >> 12) & 0x0F)));
        Out.put(static_cast<char>(0x80 | ((Ch >> 6) & 0x3F)));
        Out.put(static_cast<char>(0x80 | (Ch & 0x3F)));
      }
    else if (Ch >= 0x80)
      {
      /* two bytes: 110xxxxx 10xxxxxx */
        Out.put(static_cast<char>(0xC0 | ((Ch >> 6) & 0x1F)));
        Out.put(static_cast<char>(0x80 | (Ch & 0x3F)));
      }
    else
      {
      /* plain ASCII is encoded as itself */
        Out.put(static_cast<char>(Ch));
      } /*if*/
  } /*WriteUTF8*/

void UnquoteHTML
  (
    std::istream & In,
    std::ostream & Out
  )
  /* copies In to Out, expanding any HTML entity references into literal
    UTF-8 characters. */
  {
    enum
      {
        NoMatch,
        MatchBegin,
        MatchName,
        MatchNumber,
        MatchDecimalNumber,
        MatchHexNumber,
      } MatchState;
    std::string MatchingName;
    unsigned int CharCode;
    bool ProcessedChar, GotCharCode;
    MatchState = NoMatch;
    for (;;)
      {
        const unsigned char ThisCh = In.get();
        if (In.eof())
            break;
        ProcessedChar = false; /* to begin with */
        GotCharCode = false; /* to begin with */
        switch (MatchState)
          {
        case MatchBegin:
            if (ThisCh == '#')
              {
                MatchState = MatchNumber;
                ProcessedChar = true;
              }
            else if
              (
                    ThisCh >= 'a' and ThisCh <= 'z'
                or
                    ThisCh >= 'A' and ThisCh <= 'Z'
              )
              {
                MatchingName.append(1, ThisCh);
                MatchState = MatchName;
                ProcessedChar = true;
              }
            else
              {
                Out.put('&');
                MatchState = NoMatch;
              } /*if*/
        break;
        case MatchName:
            if
              (
                    ThisCh >= 'a' and ThisCh <= 'z'
                or
                    ThisCh >= 'A' and ThisCh <= 'Z'
                or
                    ThisCh >= '0' and ThisCh <= '9'
              )
              {
                MatchingName.append(1, ThisCh);
                ProcessedChar = true;
              }
            else if (ThisCh == ';')
              {
                if (EntityNames.empty())
                  {
                  /* first use, load EntityNames from StaticEntityNames */
                    const EntityNameEntry * ThisEntry;
                    ThisEntry = StaticEntityNames;
                    for (;;)
                      {
                        if (ThisEntry->Name == NULL)
                            break;
                        EntityNames.insert
                          (
                            EntityNamePair(std::string(ThisEntry->Name), ThisEntry->Value)
                          );
                        ++ThisEntry;
                      } /*for*/
                  } /*if*/
                const EntityNameMap::const_iterator NameEntry = EntityNames.find(MatchingName);
                if (NameEntry != EntityNames.end())
                  {
                    CharCode = NameEntry->second;
                    ProcessedChar = true;
                    GotCharCode = true;
                  } /*if*/
              } /*if*/
            if (not ProcessedChar)
              {
                Out.put('&');
                for (unsigned int i = 0; i < MatchingName.size(); ++i)
                  {
                    Out.put(MatchingName[i]);
                  } /*for*/
                MatchState = NoMatch;
              } /*if*/
        break;
        case MatchNumber:
            if (ThisCh == 'x' or ThisCh == 'X')
              {
                ProcessedChar = true;
                MatchState = MatchHexNumber;
                CharCode = 0;
              }
            else if (ThisCh >= '0' and ThisCh <= '9')
              {
                CharCode = ThisCh - '0';
                MatchState = MatchDecimalNumber;
                ProcessedChar = true;
              }
            else
              {
                MatchState = NoMatch;
              } /*if*/
        break;
        case MatchDecimalNumber:
            if (ThisCh >= '0' and ThisCh <= '9')
              {
                CharCode = CharCode * 10 + ThisCh - '0';
                ProcessedChar = true;
              }
            else if (ThisCh == ';')
              {
                ProcessedChar = true;
                GotCharCode = true;
              }
            else
              {
                MatchState = NoMatch;
              } /*if*/
        break;
        case MatchHexNumber:
            if (ThisCh >= '0' and ThisCh <= '9')
              {
                CharCode = CharCode * 16 + ThisCh - '0';
                ProcessedChar = true;
              }
            else if (ThisCh >= 'a' and ThisCh <= 'f')
              {
                CharCode = CharCode * 16 + ThisCh - 'a' + 10;
                ProcessedChar = true;
              }
            else if (ThisCh >= 'A' and ThisCh <= 'F')
              {
                CharCode = CharCode * 16 + ThisCh - 'A' + 10;
                ProcessedChar = true;
              }
            else if (ThisCh == ';')
              {
                ProcessedChar = true;
                GotCharCode = true;
              }
            else
              {
                MatchState = NoMatch;
              } /*if*/
        break;
          } /*switch*/
        if (GotCharCode)
          {
            WriteUTF8(Out, CharCode);   
            MatchState = NoMatch;
          }
        else if (not ProcessedChar and MatchState == NoMatch)
          {
            if (ThisCh == '&')
              {
                MatchState = MatchBegin;
                MatchingName.erase();
              }
            else
              {
                Out.put(ThisCh);
              } /*if*/
          } /*if*/
      } /*for*/
  } /*UnquoteHTML*/

Java

  public static final String unescapeHTML(String s, int f){
    String [][] escape =
     {{  "&lt;"     , "<" } ,
      {  "&gt;"     , ">" } ,
      {  "&amp;"    , "&" } ,
      {  "&quot;"   , "\"" } ,
      {  "&aacute;"  , "á" } ,
      {  "&Aacute;"  , "Á" } ,
      {  "&eacute;"  , "é" } ,
      {  "&Eacute;"  , "É" } ,
      {  "&iacute;"  , "í" } ,
      {  "&Iacute;"  , "Í" } ,
      {  "&oacute;"  , "ó" } ,
      {  "&Oacute;"  , "Ó" } ,
      {  "&uacute;"  , "ú" } ,
      {  "&Uacute;"  , "Ú" } ,
      {  "&Ntilde;"  , "ñ" } ,
      {  "&Ntilde;"  , "Ñ" } ,
      {  "&apos;"   , "'" } ,
      {  "&deg;;"   , "º" } ,
      {  "&agrave;" , "à" } ,
      {  "&Agrave;" , "À" } ,
      {  "&acirc;"  , "â" } ,
      {  "&auml;"   , "ä" } ,
      {  "&Auml;"   , "Ä" } ,
      {  "&Acirc;"  , "Â" } ,
      {  "&aring;"  , "å" } ,
      {  "&Aring;"  , "Å" } , 
      {  "&aelig;"  , "æ" } , 
      {  "&AElig;"  , "Æ" } ,
      {  "&ccedil;" , "ç" } ,
      {  "&Ccedil;" , "Ç" } ,
      {  "&eacute;" , "é" } ,
      {  "&Eacute;" , "É" } ,
      {  "&egrave;" , "è" } ,
      {  "&Egrave;" , "È" } ,
      {  "&ecirc;"  , "ê" } ,
      {  "&Ecirc;"  , "Ê" } ,
      {  "&euml;"   , "ë" } ,
      {  "&Euml;"   , "Ë" } ,
      {  "&iuml;"   , "ï" } , 
      {  "&Iuml;"   , "Ï" } ,
      {  "&ocirc;"  , "ô" } ,
      {  "&Ocirc;"  , "Ô" } ,
      {  "&ouml;"   , "ö" } ,
      {  "&Ouml;"   , "Ö" } ,
      {  "&oslash;" , "ø" } ,
      {  "&Oslash;" , "Ø" } ,
      {  "&szlig;"  , "ß" } ,
      {  "&ugrave;" , "ù" } ,
      {  "&Ugrave;" , "Ù" } ,
      {  "&ucirc;"  , "û" } ,
      {  "&Ucirc;"  , "Û" } , 
      {  "&uuml;"   , "ü" } ,
      {  "&Uuml;"   , "Ü" } ,
      {  "&nbsp;"   , " " } ,
      {  "&reg;"    , "\u00a9" } ,
      {  "&copy;"   , "\u00ae" } ,
      {  "&euro;"   , "\u20a0" } };
     int i, j, k, l ;
     
     i = s.indexOf("&", f);
     if (i > -1) {
        j = s.indexOf(";" ,i);
        // --------
        // we don't start from the beginning 
        // the next time, to handle the case of
        // the &
        // thanks to Pieter Hertogh for the bug fix!
        f = i + 1;
        // --------
        if (j > i) {
           // ok this is not most optimized way to
           // do it, a StringBuffer would be better,
           // this is left as an exercise to the reader!
           String temp = s.substring(i , j + 1);
           // search in escape[][] if temp is there
           k = 0;
           while (k < escape.length) {
             if (escape[k][0].equals(temp)) break;
             else k++;
             }
           if (k < escape.length) {
             s = s.substring(0 , i) + escape[k][1] + s.substring(j + 1);
             return unescapeHTML(s, f); // recursive call
             }
           }
        }   
     return s;
     }
  • Original source (Java version): [1]
  • Another Java routine is here.

Perl

# Load the HTML::Entities module, importing only its decode_entities function.
use HTML::Entities qw(decode_entities);
# Decode the entity references in $s and store the result in $decoded_s.
my $decoded_s = decode_entities($s);

$decoded_s will contain the string from $s, with all of the escaped HTML entities converted to their unescaped forms.