Difference between revisions of "Convert HTML to text"

From CodeCodex

(Undo revision 5948 by 71.122.253.195 (talk))
Line 1: Line 1:
<td class="ConfirmLeft">Order Number</td>
+
{{Infobox See Also String}}
 +
==Implementations==
 +
===Java===
 +
<pre><nowiki>
 +
package de.silpion.senior.email.util;
  
      <td class="ConfirmRight">EAT3S2RBWK</td>
+
import java.io.File;
    </tr>
+
import java.io.FileInputStream;
 +
import java.io.IOException;
 +
import java.io.Reader;
 +
import java.io.StringReader;
  
    <tr>
+
/**
      <td class="ConfirmLeft">Phone Number </td>
+
* Convert text/html into text/plain
      <td class="ConfirmRight">813-944-3466</td>
+
*
    </tr>
+
* @author Krishna Singhania
 +
*
 +
* @version 1.2 $Date: July 31, 2008 $
 +
*/
 +
public class HTML2Text {
  
     <tr>
+
     boolean body_found = false;
      <td class="ConfirmLeft">Online Account Username</td>
+
    boolean in_body = false;
      <td class="ConfirmRight">gridonfae</td>
+
     boolean center = false;
     </tr>
+
     boolean pre = false;
      
+
     String href = "";
     <tr>
+
      <td class="ConfirmLeft">Security PIN</td>
+
      <td class="ConfirmRight">1234</td>
+
  
     </tr>
+
     public String convert(String source) throws Exception {
   
+
        StringBuffer result = new StringBuffer();
    <tr>
+
        StringBuffer result2 = new StringBuffer();
  <td id="securityQuestion" class="ConfirmLeft">Who is your favorite actor?</td>
+
        StringReader input = new StringReader(source);
  <td id="securityAnswer" class="ConfirmRight">vonage1</td>
+
</tr>
+
  
  </tbody></table>
+
        try {
</div>
+
            String text = null;
 +
            int c = input.read();
  
<ul>
+
            while (c != -1) // Convert until EOF
 +
            {
 +
                text = "";
 +
                if (c == '<') // It's a tag!!
 +
                {
 +
                    String CurrentTag = getTag(input); // Get the rest of the tag
 +
                    text = convertTag(CurrentTag);
 +
                } else if (c == '&') {
 +
                    String specialchar = getSpecial(input);
 +
                    if (specialchar.equals("lt;") || specialchar.equals("#60")) {
 +
                        text = "<";
 +
                    } else if (specialchar.equals("gt;") || specialchar.equals("#62")) {
 +
                        text = ">";
 +
                    } else if (specialchar.equals("amp;") || specialchar.equals("#38")) {
 +
                        text = "&";
 +
                    } else if (specialchar.equals("nbsp;")) {
 +
                        text = " ";
 +
                    } else if (specialchar.equals("quot;") || specialchar.equals("#34")) {
 +
                        text = "\"";
 +
                    } else if (specialchar.equals("copy;") || specialchar.equals("#169")) {
 +
                        text = "[Copyright]";
 +
                    } else if (specialchar.equals("reg;") || specialchar.equals("#174")) {
 +
                        text = "[Registered]";
 +
                    } else if (specialchar.equals("trade;") || specialchar.equals("#153")) {
 +
                        text = "[Trademark]";
 +
                    } else {
 +
                        text = "&" + specialchar;
 +
                    }
 +
                } else if (!pre && Character.isWhitespace((char) c)) {
 +
                    StringBuffer s = in_body ? result : result2;
 +
                    if (s.length() > 0 && Character.isWhitespace(s.charAt(s.length() - 1))) {
 +
                        text = "";
 +
                    } else {
 +
                        text = " ";
 +
                    }
 +
                } else {
 +
                    text = "" + (char) c;
 +
                }
  
 +
                StringBuffer s = in_body ? result : result2;
 +
                s.append(text);
  
 +
                c = input.read();
 +
            }
 +
        } catch (Exception e) {
 +
            input.close();
 +
            throw e;
 +
        }
  
  <li>View and Track the shipment of your order by logging into your <a id="sign_in_link" href="https://secure.vonage.com/vonage-web/dashboard/orders.htm" target="_blank">Vonage Online Account</a></li>
+
        StringBuffer s = body_found ? result : result2;
 +
        return s.toString().trim();
 +
    }
  
  <li>You will receive an email with your order confirmation.</li>
+
    String getTag(Reader r) throws IOException {
 +
        StringBuffer result = new StringBuffer();
 +
        int level = 1;
  
  <li>If you have any questions contact <a href="http://www.vonage.com/help_contactUs.php?lid=footer_contact" target="_blank" id="contact_us_link">Vonage Customer Care</a> or call <strong>1-VONAGE-HELP (1-866-243-4357)</strong>.</li>
+
        result.append('<');
 +
        while (level > 0) {
 +
            int c = r.read();
 +
            if (c == -1) {
 +
                break; // EOF
 +
            }
 +
            result.append((char) c);
 +
            if (c == '<') {
 +
                level++;
 +
            } else if (c == '>') {
 +
                level--;
 +
            }
 +
        }
  
 +
        return result.toString();
 +
    }
  
 +
    String getSpecial(Reader r) throws IOException {
 +
        StringBuffer result = new StringBuffer();
 +
        r.mark(1); //Mark the present position in the stream
 +
        int c = r.read();
  
 +
        while (Character.isLetter((char) c)) {
 +
            result.append((char) c);
 +
            r.mark(1);
 +
            c = r.read();
 +
        }
  
  <div>
+
        if (c == ';') {
    Important Note: Do not unplug the power supply during the first 5 minutes after the device is turned on and connected to the Internet. Any power interruption during this time may cause the device to become defective.
+
            result.append(';');
  </div>
+
        } else {
 +
            r.reset();
 +
        }
 +
        return result.toString();
 +
    }
  
 +
    boolean isTag(String s1, String s2) {
 +
        s1 = s1.toLowerCase();
 +
        String t1 = "<" + s2.toLowerCase() + ">";
 +
        String t2 = "<" + s2.toLowerCase() + " ";
  
</ul>
+
        return s1.startsWith(t1) || s1.startsWith(t2);
 +
    }
  
     <table class="SummaryTable" border="0" cellpadding="0" cellspacing="0">
+
     String convertTag(String t) throws IOException {
 +
        String result = "";
  
  <tbody><tr>
+
        if (isTag(t, "body")) {
 +
            in_body = true;
 +
            body_found = true;
 +
        } else if (isTag(t, "/body")) {
 +
            in_body = false;
 +
            result = "";
 +
        } else if (isTag(t, "center")) {
 +
            result = "";
 +
            center = true;
 +
        } else if (isTag(t, "/center")) {
 +
            result = "";
 +
            center = false;
 +
        } else if (isTag(t, "pre")) {
 +
            result = "";
 +
            pre = true;
 +
        } else if (isTag(t, "/pre")) {
 +
            result = "";
 +
            pre = false;
 +
        } else if (isTag(t, "p")) {
 +
            result = "";
 +
        } else if (isTag(t, "br")) {
 +
            result = "";
 +
        } else if (isTag(t, "h1") || isTag(t, "h2") || isTag(t, "h3") || isTag(t, "h4") || isTag(t, "h5") || isTag(t, "h6") || isTag(t, "h7")) {
 +
            result = "";
 +
        } else if (isTag(t, "/h1") || isTag(t, "/h2") || isTag(t, "/h3") || isTag(t, "/h4") || isTag(t, "/h5") || isTag(t, "/h6") || isTag(t, "/h7")) {
 +
            result = "";
 +
        } else if (isTag(t, "/dl")) {
 +
            result = "";
 +
        } else if (isTag(t, "dd")) {
 +
            result = "  * ";
 +
        } else if (isTag(t, "dt")) {
 +
            result = "      ";
 +
        } else if (isTag(t, "li")) {
 +
            result = "  * ";
 +
        } else if (isTag(t, "/ul")) {
 +
            result = "";
 +
        } else if (isTag(t, "/ol")) {
 +
            result = "";
 +
        } else if (isTag(t, "hr")) {
 +
            result = "_________________________________________";
 +
        } else if (isTag(t, "table")) {
 +
            result = "";
 +
        } else if (isTag(t, "/table")) {
 +
            result = "";
 +
        } else if (isTag(t, "form")) {
 +
            result = "";
 +
        } else if (isTag(t, "/form")) {
 +
            result = "";
 +
        } else if (isTag(t, "b")) {
 +
            result = "*";
 +
        } else if (isTag(t, "/b")) {
 +
            result = "*";
 +
        } else if (isTag(t, "i")) {
 +
            result = "\"";
 +
        } else if (isTag(t, "/i")) {
 +
            result = "\"";
 +
        } else if (isTag(t, "img")) {
 +
            int idx = t.indexOf("alt=\"");
 +
            if (idx != -1) {
 +
                idx += 5;
 +
                int idx2 = t.indexOf("/\"", idx);
 +
                result = t.substring(idx, idx2);
 +
            }
 +
        } else if (isTag(t, "a")) {
 +
            int idx = t.indexOf("href=\"");
 +
            if (idx != -1) {
 +
                idx += 6;
 +
                int idx2 = t.indexOf("/\"", idx);
 +
                href = t.substring(idx, idx2);
 +
            } else {
 +
                href = "";
 +
            }
 +
        } else if (isTag(t, "/a")) {
 +
            if (href.length() > 0) {
 +
                result = " [ " + href + " ]";
 +
                href = "";
 +
            }
 +
        }
  
    <td rowspan="2" class="GreyColumn">Plan Summary </td>
+
        return result;
    <td class="DescriptionColumn">Plan Name </td>
+
     }
    <td class="DetailsColumn">Vonage World Residential Plan</td>
+
     <td class="EditColumn">
+
  
          <a href="https://subscribe.vonage.com/plans" class="BlueURL" id="plan_edit">Edit</a>
+
    public static void main(String argv[]) throws Exception {
         </td>
+
        FileInputStream fis = null;
 +
         String s = null;
  
  </tr>
+
        try {
 +
            File file;
 +
            if (argv[0] != null) {
 +
                file = new File(argv[0]);
 +
            } else {
 +
                file = new File("html_test_file.html");
 +
            }
 +
            fis = new FileInputStream(file);
 +
            byte buf[] = new byte[fis.available()];
 +
            //bytes that can be read from this file input stream without blocking
  
  <tr>
+
            fis.read(buf);
 +
            fis.close();
 +
            fis = null;
 +
            s = new String(buf);
 +
            HTML2Text h = new HTML2Text();
 +
            System.out.println(h.convert(s));
 +
        } catch (Exception e) {
 +
            if (fis != null) {
 +
                fis.close();
 +
            }
 +
            throw e;
 +
        }
 +
    }
 +
}
 +
</nowiki></pre>
  
 
+
Original Source: [http://www.sourcecodesworld.com/source/show.asp?ScriptID=976]
    <td class="DescriptionColumn GreyBottomBorder">Phone Number </td>
+
Original Author: Rockey Mandy
    <td class="DetailsColumn GreyBottomBorder">813-944-3466</td>     
+
  
 
+
===Perl===
    <td class="EditColumn GreyBottomBorder">
+
<HIGHLIGHTSYNTAX language="perl">
      <a href="/order_details" class="BlueURL" id="select_number_edit">Edit</a>
+
use HTML::TreeBuilder qw();
     </td>
+
use HTML::FormatText qw();
 +
print HTML::FormatText->new->format(
 +
     HTML::TreeBuilder->new->parse_file('foo.html')
 +
);
 +
</HIGHLIGHTSYNTAX>
  
  </tr>
+
Another similar page is [[Unescape HTML special characters from a String]].
  
  <tr>
+
[[Category:HTML]]
    <td class="GreyColumn">Device Summary </td>
+
[[Category:Java]]
   
+
[[Category:Perl]]
   
+
[[Category:Quoting]]
    <td class="DescriptionColumn GreyBottomBorder">MAC Address</td>
+
[[Category:String]]
    <td class="DetailsColumn GreyBottomBorder">00242BB4F033</td>
+
   
+
 
+
    <td class="EditColumn GreyBottomBorder">
+
   
+
 
+
    </td>
+
 
+
  </tr>
+
 
+
  <tr>
+
    <td class="GreyColumn">911 Information </td>
+
    <td class="DescriptionColumn GreyBottomBorder">Dialing Address </td>
+
    <td class="DetailsColumn GreyBottomBorder">
+
    10101 WINDHORST ROAD<br>TAMPA, FL 33619      
+
    </td>
+
 
+
    <td class="EditColumn GreyBottomBorder">
+
      <a href="https://subscribe.vonage.com/emergency_address" class="BlueURL" id="emergency_edit">Edit</a>
+
    </td>
+
  </tr>
+
 
+
  <tr>
+
  <td rowspan="6" class="GreyColumn">Account&nbsp;Information</td>
+
  <td class="DescriptionColumn">Name </td>
+
 
+
  <td class="DetailsColumn">Faith Gridon</td>
+
  <td class="EditColumn">
+
    <a href="https://subscribe.vonage.com/account" class="BlueURL" id="account_edit">Edit</a>
+
  </td>
+
</tr>
+
 
+
 
+
 
+
<tr>
+
  <td class="DescriptionColumn">Username </td>
+
 
+
  <td class="DetailsColumn">gridonfae</td>
+
  <td class="EditColumn">&nbsp;</td>
+
</tr>
+
 
+
 
+
<tr>
+
  <td class="DescriptionColumn">Password </td>
+
  <td class="DetailsColumn">***********</td>
+
  <td class="EditColumn">&nbsp;</td>
+
 
+
</tr>
+
 
+
 
+
<tr>
+
  <td class="DescriptionColumn">Email </td>
+
  <td class="DetailsColumn">gridonfae@yahoo.com</td>
+
  <td class="EditColumn">&nbsp;</td>
+
</tr>
+
 
+
<tr>
+
  <td class="DescriptionColumn">Phone Number </td>
+
 
+
  <td class="DetailsColumn">(813) 658-7456</td>
+
  <td class="EditColumn">&nbsp;</td>
+
</tr>
+
 
+
 
+
 
+
 
+
<tr>
+
  <td id="securityQuestion" class="DescriptionColumn GreyBottomBorder">Who is your favorite actor?</td>
+
  <td id="securityAnswer" class="DetailsColumn GreyBottomBorder">vonage1</td>
+
 
+
  <td class="EditColumn GreyBottomBorder">&nbsp;</td>
+
</tr>
+
 
+
 
+
 
+
  <tr>
+
  <td rowspan="6" class="GreyColumn">Billing Information </td>
+
  <td class="DescriptionColumn">Name </td>
+
  <td class="DetailsColumn">Faith Gridon</td>
+
 
+
  <td class="EditColumn">&nbsp;</td>
+
</tr>
+
 
+
<tr>
+
  <td class="DescriptionColumn">Payment Option</td>
+
  <td class="DetailsColumn">Monthly</td>
+
  <td class="EditColumn">
+
    &nbsp;  </td>
+
</tr>
+
 
+
<tr>
+
  <td class="DescriptionColumn">Payment Type</td>
+
  <td class="DetailsColumn">Credit Card</td>
+
 
+
  <td class="EditColumn">
+
    <a href="https://subscribe.vonage.com/billing_and_shipping" class="BlueURL" id="billing_edit">Edit</a>
+
  </td>
+
 
+
</tr>
+
 
+
<tr>
+
  <td class="DescriptionColumn">Card Type</td>
+
  <td class="DetailsColumn">Mastercard</td>
+
  <td class="EditColumn">
+
    <a href="https://subscribe.vonage.com/billing_and_shipping" class="BlueURL" id="edit_checking_account">Edit</a>
+
  </td>
+
</tr>
+
 
+
<tr>
+
  <td class="DescriptionColumn">Card Number</td>
+
  <td class="DetailsColumn">************6325</td>
+
  <td class="EditColumn">
+
    <a href="https://subscribe.vonage.com/billing_and_shipping" class="BlueURL" id="edit_credit_card">Edit</a>
+
  </td>
+
</tr>
+
 
+
<tr>
+
 
+
  <td class="DescriptionColumn GreyBottomBorder">Billing Address </td>
+
  <td class="DetailsColumn GreyBottomBorder">10101 WINDHORST ROAD<br>TAMPA, FL 33619</td>
+
  <td class="EditColumn GreyBottomBorder">
+
    <a href="https://subscribe.vonage.com/billing_and_shipping" class="BlueURL" id="edit_billing_address">Edit</a>
+
  </td>
+
</tr>
+
 
+
 
+
 
+
 
+
 
+
 
+
 
+
</tbody></table>
+
 
+
 
+
 
+
    <div class="SummaryRightWhite">
+
  <table class="SummaryRightTable" border="0" cellpadding="0" cellspacing="0">
+
    <tbody><tr>
+
      <th colspan="3" class="OrangeText">
+
 
+
        First Month Charges
+
      </th>
+
    </tr>
+
   
+
   
+
   
+
    <tr id="USVONVW_0MF_VVV_RESPRM_invoice_row">
+
  <td class="monthlyInvoiceOrTaxItemLabel">
+
    Vonage World Residential Plan:
+
  </td>
+
 
+
    <td class="monthlyDollarSign">$</td>
+
    <td class="monthlyInvoiceOrTaxItemAmount">
+
 
+
 
+
    24.99
+
      </td>
+
</tr>
+
 
+
   
+
    <tr id="V_KIOSK_REBATE_invoice_row">
+
  <td class="monthlyInvoiceOrTaxItemLabel">
+
    Phone Adapter:
+
  </td>
+
 
+
 
+
   
+
       
+
        <td class="monthlyDollarSign">&nbsp;</td>
+
   
+
    <td id="V_KIOSK_REBATE_amount" class="monthlyInvoiceOrTaxItemAmountWide" colspan="1">
+
 
+
    0.00
+
      </td>
+
</tr>
+
 
+
   
+
    <tr id="VON_INSTALL_invoice_row">
+
  <td class="monthlyInvoiceOrTaxItemLabel">
+
    Activation Fee:
+
  </td>
+
 
+
 
+
   
+
       
+
        <td class="monthlyDollarSign">&nbsp;</td>
+
   
+
    <td id="VON_INSTALL_amount" class="monthlyInvoiceOrTaxItemAmountWide" colspan="1">
+
 
+
    0.00
+
      </td>
+
</tr>
+
 
+
   
+
   
+
    <tr id="monthlyTaxesAndFeesHeader" onclick="">
+
      <td id="monthlyTaxesAndFeesLabel" class="Collapsed">
+
        <p>
+
          Taxes &amp; Fees
+
        </p>
+
      </td>
+
 
+
      <td class="monthlyInvoiceOrTaxItemAmountWide" colspan="2">
+
        <span id="monthlyTaxesAndFeesTotal">
+
        </span>
+
      </td>
+
    </tr>
+
   
+
    <tr id="161_1_tax_row" class="monthlyTaxItem" style="">
+
  <td class="monthlyInvoiceOrTaxItemLabel">
+
    State E911 Fee:
+
  </td>
+
<td class="monthlyDollarSign">&nbsp;</td>
+
 
+
  <td class="monthlyInvoiceOrTaxItemAmount">
+
    0.50
+
  </td>
+
</tr>
+
<tr id="64_4_tax_row" class="monthlyTaxItem" style="">
+
  <td class="monthlyInvoiceOrTaxItemLabel">
+
    County Communications Service Tax:
+
  </td>
+
<td class="monthlyDollarSign">&nbsp;</td>
+
  <td class="monthlyInvoiceOrTaxItemAmount">
+
    1.39
+
  </td>
+
 
+
</tr>
+
<tr id="162_0_tax_row" class="monthlyTaxItem" style="">
+
  <td class="monthlyInvoiceOrTaxItemLabel">
+
    Federal Program Fee:
+
  </td>
+
<td class="monthlyDollarSign">&nbsp;</td>
+
  <td class="monthlyInvoiceOrTaxItemAmount">
+
    2.24
+
  </td>
+
</tr>
+
<tr id="1_tax_row" class="monthlyTaxItem" style="">
+
  <td class="monthlyInvoiceOrTaxItemLabel">
+
 
+
    Sales Tax:
+
  </td>
+
<td class="monthlyDollarSign">&nbsp;</td>
+
  <td class="monthlyInvoiceOrTaxItemAmount">
+
    0.00
+
  </td>
+
</tr>
+
<tr id="USF_tax_row" class="monthlyTaxItem" style="">
+
  <td class="monthlyInvoiceOrTaxItemLabel">
+
    Regulatory and Compliance Fee:
+
  </td>
+
<td class="monthlyDollarSign">&nbsp;</td>
+
 
+
  <td class="monthlyInvoiceOrTaxItemAmount">
+
    1.49
+
  </td>
+
</tr>
+
<tr id="911RRF_tax_row" class="monthlyTaxItem" style="">
+
  <td class="monthlyInvoiceOrTaxItemLabel">
+
    Emergency 911 Service Fee:
+
  </td>
+
<td class="monthlyDollarSign">&nbsp;</td>
+
  <td class="monthlyInvoiceOrTaxItemAmount">
+
    1.49
+
  </td>
+
 
+
</tr>
+
<tr id="14_1_tax_row" class="monthlyTaxItem" style="">
+
  <td class="monthlyInvoiceOrTaxItemLabel">
+
    State Gross Receipts Surcharge:
+
  </td>
+
<td class="monthlyDollarSign">&nbsp;</td>
+
  <td class="monthlyInvoiceOrTaxItemAmount">
+
    0.72
+
  </td>
+
</tr>
+
 
+
   
+
    <tr class="TotalLine">
+
 
+
      <td class="monthlyInvoiceOrTaxItemLabel">
+
        Total Charges Today:
+
      </td>
+
      <td class="monthlyDollarSign">$</td>
+
      <td id="firstMonthTotalAmount" class="monthlyInvoiceOrTaxItemAmount" colspan="2">
+
          32.82
+
      </td>
+
    </tr>
+
+
+
 
+
  </tbody></table>
+
 
+
</div>
+
 
+
 
+
<div class="SummaryRightWhite">
+
  <table class="SummaryRightTable" border="0" cellpadding="0" cellspacing="0">
+
   
+
    <tbody><tr>
+
      <th colspan="3" class="OrangeText">Monthly Recurring Charges</th>
+
    </tr>
+
   
+
    <tr id="USVONVW_0MF_VVV_RESPRM_invoice_row">
+
  <td class="recurringInvoiceOrTaxItemLabel">
+
 
+
    Vonage World Residential Plan:
+
  </td>
+
 
+
    <td class="recurringDollarSign">$</td>
+
    <td class="recurringInvoiceOrTaxItemAmount">
+
 
+
    24.99
+
      </td>
+
</tr>
+
 
+
   
+
    <tr id="recurringTaxesAndFeesHeader" onclick="">
+
      <td id="recurringTaxesAndFeesLabel" class="Collapsed">
+
 
+
        <p>
+
          Estimated Taxes &amp; Fees
+
        </p>
+
      </td>
+
      <td class="monthlyDollarSign">&nbsp;</td>
+
      <td class="recurringInvoiceOrTaxItemAmountWide">
+
        <span id="recurringTaxesAndFeesTotal">
+
        </span>
+
 
+
      </td>
+
    </tr>
+
   
+
    <tr id="162_0_tax_row" class="recurringTaxItem" style="">
+
  <td class="recurringInvoiceOrTaxItemLabel">
+
    Federal Program Fee:
+
  </td>
+
<td class="monthlyDollarSign">&nbsp;</td>
+
  <td class="recurringInvoiceOrTaxItemAmount">
+
    2.24
+
  </td>
+
 
+
</tr>
+
<tr id="64_4_tax_row" class="recurringTaxItem" style="">
+
  <td class="recurringInvoiceOrTaxItemLabel">
+
    County Communications Service Tax:
+
  </td>
+
<td class="monthlyDollarSign">&nbsp;</td>
+
  <td class="recurringInvoiceOrTaxItemAmount">
+
    1.39
+
  </td>
+
</tr>
+
<tr id="14_1_tax_row" class="recurringTaxItem" style="">
+
  <td class="recurringInvoiceOrTaxItemLabel">
+
 
+
    State Gross Receipts Surcharge:
+
  </td>
+
<td class="monthlyDollarSign">&nbsp;</td>
+
  <td class="recurringInvoiceOrTaxItemAmount">
+
    0.72
+
  </td>
+
</tr>
+
<tr id="161_1_tax_row" class="recurringTaxItem" style="">
+
  <td class="recurringInvoiceOrTaxItemLabel">
+
    State E911 Fee:
+
  </td>
+
<td class="monthlyDollarSign">&nbsp;</td>
+
 
+
  <td class="recurringInvoiceOrTaxItemAmount">
+
    0.50
+
  </td>
+
</tr>
+
<tr id="USF_tax_row" class="recurringTaxItem" style="">
+
  <td class="recurringInvoiceOrTaxItemLabel">
+
    Regulatory and Compliance Fee:
+
  </td>
+
<td class="monthlyDollarSign">&nbsp;</td>
+
  <td class="recurringInvoiceOrTaxItemAmount">
+
    1.49
+
  </td>
+
 
+
</tr>
+
<tr id="911RRF_tax_row" class="recurringTaxItem" style="">
+
  <td class="recurringInvoiceOrTaxItemLabel">
+
    Emergency 911 Service Fee:
+
  </td>
+
<td class="monthlyDollarSign">&nbsp;</td>
+
  <td class="recurringInvoiceOrTaxItemAmount">
+
    1.49
+
  </td>
+
</tr>
+
 
+
   
+
    <tr class="TotalLine">
+
 
+
      <td class="recurringInvoiceOrTaxItemLabel">
+
        Total Monthly Charges:
+
      </td>
+
      <td class="recurringDollarSign">$</td>
+
      <td class="recurringInvoiceOrTaxItemAmount">
+
          32.82
+
      </td>
+
    </tr>
+
 
+
  </tbody></table>
+
 
+
</div>
+
 
+
 
+
<div id="SummaryRightColumnNote">
+
 
+
      <p>Estimated Taxes and Fees do not include monthly recurring sales tax.</p>
+

Revision as of 20:24, 18 March 2014

Related content:

Implementations

Java

package de.silpion.senior.email.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

/**
 * Convert text/html into text/plain.
 *
 * <p>This is a small hand-rolled, single-pass converter, not a full HTML
 * parser. It reads the source one character at a time, replaces a fixed set
 * of tags and character entities with plain-text equivalents and collapses
 * runs of whitespace outside of {@code <pre>} sections.
 *
 * <p>If the document contains a {@code <body>} tag, only the text found
 * inside the body is returned; otherwise all converted text is returned.
 *
 * <p>Instances keep parser state in fields while {@link #convert(String)}
 * runs, so a single instance must not be used by several threads at once.
 * The state is reset at the start of every call, so sequential reuse is fine.
 *
 * @author Krishna Singhania
 *
 * @version 1.2 $Date: July 31, 2008 $
 */
public class HTML2Text {

    /** True once a {@code <body>} tag has been seen in the current document. */
    boolean body_found = false;
    /** True while the parser is between {@code <body>} and {@code </body>}. */
    boolean in_body = false;
    /** True while inside {@code <center>}; tracked but not used for output. */
    boolean center = false;
    /** True while inside {@code <pre>}; disables whitespace collapsing. */
    boolean pre = false;
    /** Target of the most recent {@code <a href="...">}, emitted at {@code </a>}. */
    String href = "";

    /**
     * Converts an HTML document (or fragment) into plain text.
     *
     * @param source the HTML markup; {@code null} is treated as empty input
     * @return the trimmed plain-text rendering; the body text when a
     *         {@code <body>} tag was present, otherwise all converted text
     * @throws Exception if reading from the in-memory source fails
     */
    public String convert(String source) throws Exception {
        if (source == null) {
            return "";
        }

        // Reset per-document state so one instance can convert several
        // documents in a row without the previous one leaking into the next.
        body_found = false;
        in_body = false;
        center = false;
        pre = false;
        href = "";

        StringBuilder bodyText = new StringBuilder();
        StringBuilder otherText = new StringBuilder();
        StringReader input = new StringReader(source);

        try {
            int c = input.read();
            while (c != -1) { // Convert until EOF
                String text;
                if (c == '<') {
                    // A tag: consume it up to the matching '>' and translate it.
                    text = convertTag(getTag(input));
                } else if (c == '&') {
                    text = decodeEntity(getSpecial(input));
                } else if (!pre && Character.isWhitespace((char) c)) {
                    // Outside <pre>, collapse any whitespace run to one space.
                    StringBuilder target = in_body ? bodyText : otherText;
                    boolean afterWhitespace = target.length() > 0
                            && Character.isWhitespace(target.charAt(target.length() - 1));
                    text = afterWhitespace ? "" : " ";
                } else {
                    text = String.valueOf((char) c);
                }

                // Pick the buffer only now: convertTag may just have
                // entered or left the body.
                (in_body ? bodyText : otherText).append(text);

                c = input.read();
            }
        } finally {
            input.close();
        }

        return (body_found ? bodyText : otherText).toString().trim();
    }

    /**
     * Maps the entity text returned by {@link #getSpecial(Reader)} to its
     * plain-text replacement.
     *
     * <p>Named entities are only recognised when terminated by ';'. The
     * numeric forms are recognised with or without the terminator. Anything
     * unknown is passed through unchanged, prefixed by the '&amp;' that
     * introduced it.
     *
     * @param special the entity text without the leading '&amp;'
     * @return the replacement text
     */
    private static String decodeEntity(String special) {
        boolean terminated = special.endsWith(";");
        String name = terminated ? special.substring(0, special.length() - 1) : special;

        if (terminated || name.startsWith("#")) {
            if (name.equals("lt") || name.equals("#60")) {
                return "<";
            } else if (name.equals("gt") || name.equals("#62")) {
                return ">";
            } else if (name.equals("amp") || name.equals("#38")) {
                return "&";
            } else if (name.equals("nbsp")) {
                return " ";
            } else if (name.equals("quot") || name.equals("#34")) {
                return "\"";
            } else if (name.equals("copy") || name.equals("#169")) {
                return "[Copyright]";
            } else if (name.equals("reg") || name.equals("#174")) {
                return "[Registered]";
            } else if (name.equals("trade") || name.equals("#153")) {
                return "[Trademark]";
            }
        }
        return "&" + special;
    }

    /**
     * Reads the remainder of a tag whose opening '&lt;' has already been
     * consumed. Nested '&lt;' characters increase the nesting level so the
     * tag only ends at the matching '&gt;'.
     *
     * @param r the reader positioned just after the opening '&lt;'
     * @return the tag text including the leading '&lt;' and, unless the
     *         input ended first, the closing '&gt;'
     * @throws IOException if reading fails
     */
    String getTag(Reader r) throws IOException {
        StringBuilder result = new StringBuilder();
        int level = 1;

        result.append('<');
        while (level > 0) {
            int c = r.read();
            if (c == -1) {
                break; // EOF: return the unterminated tag as-is
            }
            result.append((char) c);
            if (c == '<') {
                level++;
            } else if (c == '>') {
                level--;
            }
        }

        return result.toString();
    }

    /**
     * Reads the name of a character entity whose leading '&amp;' has already
     * been consumed.
     *
     * <p>Accepts either a run of letters (named entity) or '#' followed by a
     * run of decimal digits (numeric entity). A directly following ';' is
     * consumed and appended; any other terminating character is pushed back
     * so the caller sees it as ordinary input.
     *
     * @param r a reader that supports {@code mark}/{@code reset}
     * @return the entity text without the leading '&amp;', possibly empty
     * @throws IOException if reading fails or the reader cannot be reset
     */
    String getSpecial(Reader r) throws IOException {
        StringBuilder result = new StringBuilder();
        r.mark(1); // Mark before every read so the terminator can be pushed back
        int c = r.read();

        if (c == '#') {
            result.append('#');
            r.mark(1);
            c = r.read();
            while (c >= '0' && c <= '9') {
                result.append((char) c);
                r.mark(1);
                c = r.read();
            }
        } else {
            while (c != -1 && Character.isLetter((char) c)) {
                result.append((char) c);
                r.mark(1);
                c = r.read();
            }
        }

        if (c == ';') {
            result.append(';');
        } else {
            r.reset();
        }
        return result.toString();
    }

    /**
     * Tells whether the raw tag text {@code s1} is an occurrence of the tag
     * named {@code s2}. The comparison ignores case and is independent of
     * the default locale.
     *
     * @param s1 the raw tag text, starting with '&lt;'
     * @param s2 the tag name to test for, e.g. {@code "br"} or {@code "/a"}
     * @return true if the name is followed by '&gt;', '/' or whitespace
     */
    boolean isTag(String s1, String s2) {
        String prefix = "<" + s2;
        int n = prefix.length();
        if (s1.length() <= n || !s1.regionMatches(true, 0, prefix, 0, n)) {
            return false;
        }
        // The name must end here, otherwise "<b" would also match "<body>".
        char next = s1.charAt(n);
        return next == '>' || next == '/' || Character.isWhitespace(next);
    }

    /**
     * Tells whether {@code t} is one of the heading tags h1 to h7.
     *
     * @param t      the raw tag text
     * @param prefix {@code "h"} for opening tags, {@code "/h"} for closing tags
     */
    private boolean isHeading(String t, String prefix) {
        for (int level = 1; level <= 7; level++) {
            if (isTag(t, prefix + level)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Extracts the value of a double-quoted attribute from raw tag text.
     * The attribute name is matched case-insensitively and must be preceded
     * by whitespace, so that {@code href} does not match {@code data-href}.
     *
     * @param tag  the raw tag text
     * @param name the attribute name, e.g. {@code "href"}
     * @return the text between the quotes, or {@code null} when the
     *         attribute is absent or its closing quote is missing
     */
    private static String attributeValue(String tag, String name) {
        String needle = name + "=\"";
        int last = tag.length() - needle.length();
        for (int i = 1; i <= last; i++) {
            if (Character.isWhitespace(tag.charAt(i - 1))
                    && tag.regionMatches(true, i, needle, 0, needle.length())) {
                int start = i + needle.length();
                int end = tag.indexOf('"', start);
                return end == -1 ? null : tag.substring(start, end);
            }
        }
        return null;
    }

    /**
     * Translates one tag into its plain-text replacement and updates the
     * parser state ({@code in_body}, {@code body_found}, {@code center},
     * {@code pre}, {@code href}) as a side effect.
     *
     * <p>Tags that start or end a block (paragraphs, line breaks, headings,
     * lists, rules, tables, forms) produce a line break so that neighbouring
     * blocks do not run together. Tags that are not listed produce no output.
     *
     * @param t the raw tag text as returned by {@link #getTag(Reader)}
     * @return the replacement text, possibly empty
     * @throws IOException never thrown by this implementation; declared for
     *         compatibility with existing callers
     */
    String convertTag(String t) throws IOException {
        String result = "";

        if (isTag(t, "body")) {
            in_body = true;
            body_found = true;
        } else if (isTag(t, "/body")) {
            in_body = false;
        } else if (isTag(t, "center")) {
            center = true;
        } else if (isTag(t, "/center")) {
            center = false;
        } else if (isTag(t, "pre")) {
            pre = true;
        } else if (isTag(t, "/pre")) {
            pre = false;
        } else if (isTag(t, "p") || isTag(t, "br")) {
            result = "\n";
        } else if (isHeading(t, "h") || isHeading(t, "/h")) {
            result = "\n";
        } else if (isTag(t, "/dl")) {
            result = "\n";
        } else if (isTag(t, "dd")) {
            result = "\n  * ";
        } else if (isTag(t, "dt")) {
            result = "\n      ";
        } else if (isTag(t, "li")) {
            result = "\n  * ";
        } else if (isTag(t, "/ul") || isTag(t, "/ol")) {
            result = "\n";
        } else if (isTag(t, "hr")) {
            result = "\n_________________________________________\n";
        } else if (isTag(t, "table") || isTag(t, "/table")) {
            result = "\n";
        } else if (isTag(t, "form") || isTag(t, "/form")) {
            result = "\n";
        } else if (isTag(t, "b") || isTag(t, "/b")) {
            result = "*";
        } else if (isTag(t, "i") || isTag(t, "/i")) {
            result = "\"";
        } else if (isTag(t, "img")) {
            // An image is represented by its alternative text, if any.
            String alt = attributeValue(t, "alt");
            if (alt != null) {
                result = alt;
            }
        } else if (isTag(t, "a")) {
            // Remember the target; it is printed after the link text at </a>.
            String target = attributeValue(t, "href");
            href = target != null ? target : "";
        } else if (isTag(t, "/a")) {
            if (href.length() > 0) {
                result = " [ " + href + " ]";
                href = "";
            }
        }

        return result;
    }

    /**
     * Command-line entry point: converts the file named by the first
     * argument, or {@code html_test_file.html} when no argument is given,
     * and prints the plain text to standard output. The file is decoded
     * with the platform default charset.
     *
     * @param argv optional single argument: path of the HTML file
     * @throws Exception if the file cannot be read or converted
     */
    public static void main(String argv[]) throws Exception {
        File file;
        if (argv.length > 0 && argv[0] != null) {
            file = new File(argv[0]);
        } else {
            file = new File("html_test_file.html");
        }

        byte[] buf = new byte[(int) file.length()];
        int filled = 0;
        FileInputStream fis = new FileInputStream(file);
        try {
            // A single read() may return fewer bytes than requested, so keep
            // reading until the buffer is full or the stream ends.
            while (filled < buf.length) {
                int n = fis.read(buf, filled, buf.length - filled);
                if (n == -1) {
                    break;
                }
                filled += n;
            }
        } finally {
            fis.close();
        }

        String s = new String(buf, 0, filled);
        HTML2Text h = new HTML2Text();
        System.out.println(h.convert(s));
    }
}

Original Source: [http://www.sourcecodesworld.com/source/show.asp?ScriptID=976]

Original Author: Rockey Mandy

Perl

<HIGHLIGHTSYNTAX language="perl">
use HTML::TreeBuilder qw();
use HTML::FormatText qw();
print HTML::FormatText->new->format(
    HTML::TreeBuilder->new->parse_file('foo.html')
);
</HIGHLIGHTSYNTAX>

Another similar page is Unescape HTML special characters from a String.