Võ Văn Hải's blog

Chỉ có một điều tuyệt đối đó là mọi thứ đều tương đối…

Chuyển Html sang XML

Sau đây là lớp Html2Xml với các phương thức để chuyển nội dung HTML (lấy từ URL, từ file hoặc từ chuỗi văn bản) sang XML.

Mã nguồn được tham khảo từ trang http://sourceforge.net/projects/light-html2xml, sau đó bổ sung thêm các phương thức tiện ích.

Chúc các bạn thành công!

package vovanhai.wordpress.com;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.Writer;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Scanner;
import java.util.Stack;

/**
 * Converts loosely structured HTML into well-formed XML.
 *
 * <p>The converter is a single-pass, character-driven state machine. It
 * lower-cases tag and attribute names, quotes every attribute value, closes
 * elements whose end tag is optional in HTML, self-closes void elements,
 * rewrites HTML named entities as numeric character references, drops
 * comments, DOCTYPE declarations and processing instructions, and wraps the
 * content of {@code <script>} elements in a CDATA section. Content that does
 * not start with an {@code <html>} element is wrapped in one.
 *
 * <p>Files are read and written as UTF-8, and the emitted XML declaration
 * always announces {@code utf-8} so that it matches the bytes on disk.
 */
public class Html2Xml {

    /**
     * Parser states. {@code script}, {@code endscript} and {@code skipcdata}
     * are no longer entered (script bodies are copied in one step) but are
     * kept so the enum stays source compatible.
     */
    enum states {text, tag, endtag, attrtext, script, endscript, specialtag, comment, skipcdata, entity, namedentity, numericentity, hexaentity, tillgt, tillquote, tillinst, andgt}

    /** Charset used for every file and URL that is read or written. */
    private static final String CHARSET = "UTF-8";

    /** XML declaration placed in front of every result. */
    private static final String XML_DECLARATION = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n";

    /** End tag that terminates a script body (matched case-insensitively). */
    private static final String SCRIPT_END = "</script>";

    /** Longest entity name in the table ("thetasym"). */
    private static final int MAX_ENTITY_NAME_LENGTH = 8;

    /** HTML 4 named entities as space separated {@code name=codepoint} pairs. */
    private static final String ENTITY_TABLE =
          "AElig=198 Aacute=193 Acirc=194 Agrave=192 Alpha=913 Aring=197 Atilde=195 Auml=196 Beta=914 Ccedil=199 "
        + "Chi=935 Dagger=8225 Delta=916 ETH=208 Eacute=201 Ecirc=202 Egrave=200 Epsilon=917 Eta=919 Euml=203 "
        + "Gamma=915 Iacute=205 Icirc=206 Igrave=204 Iota=921 Iuml=207 Kappa=922 Lambda=923 Mu=924 Ntilde=209 "
        + "Nu=925 OElig=338 Oacute=211 Ocirc=212 Ograve=210 Omega=937 Omicron=927 Oslash=216 Otilde=213 Ouml=214 "
        + "Phi=934 Pi=928 Prime=8243 Psi=936 Rho=929 Scaron=352 Sigma=931 THORN=222 Tau=932 Theta=920 "
        + "Uacute=218 Ucirc=219 Ugrave=217 Upsilon=933 Uuml=220 Xi=926 Yacute=221 Yuml=376 Zeta=918 "
        + "aacute=225 acirc=226 acute=180 aelig=230 agrave=224 alefsym=8501 alpha=945 and=8743 ang=8736 aring=229 "
        + "asymp=8776 atilde=227 auml=228 bdquo=8222 beta=946 brvbar=166 bull=8226 cap=8745 ccedil=231 cedil=184 "
        + "cent=162 chi=967 circ=710 clubs=9827 cong=8773 copy=169 crarr=8629 cup=8746 curren=164 dArr=8659 "
        + "dagger=8224 darr=8595 deg=176 delta=948 diams=9830 divide=247 eacute=233 ecirc=234 egrave=232 empty=8709 "
        + "emsp=8195 ensp=8194 epsilon=949 equiv=8801 eta=951 eth=240 euml=235 euro=8364 exist=8707 exists=8707 "
        + "fnof=402 forall=8704 frac12=189 frac14=188 frac34=190 frasl=8260 gamma=947 ge=8805 hArr=8660 harr=8596 "
        + "hearts=9829 hellip=8230 iacute=237 icirc=238 iexcl=161 igrave=236 image=8465 infin=8734 int=8747 iota=953 "
        + "iquest=191 isin=8712 iuml=239 kappa=954 lArr=8656 lambda=955 lang=9001 laquo=171 larr=8592 lceil=8968 "
        + "ldquo=8220 le=8804 lfloor=8970 lowast=8727 loz=9674 lrm=8206 lsaquo=8249 lsquo=8216 macr=175 mdash=8212 "
        + "micro=181 middot=183 minus=8722 mu=956 nabla=8711 nbsp=160 ndash=8211 ne=8800 ni=8715 not=172 "
        + "notin=8713 nsub=8836 ntilde=241 nu=957 oacute=243 ocirc=244 oelig=339 ograve=242 oline=8254 omega=969 "
        + "omicron=959 oplus=8853 or=8744 ordf=170 ordm=186 oslash=248 otilde=245 otimes=8855 ouml=246 para=182 "
        + "part=8706 permil=8240 perp=8869 phi=966 pi=960 piv=982 plusmn=177 pound=163 prime=8242 prod=8719 "
        + "prop=8733 psi=968 rArr=8658 radic=8730 rang=9002 raquo=187 rarr=8594 rceil=8969 rdquo=8221 real=8476 "
        + "reg=174 rfloor=8971 rho=961 rlm=8207 rsaquo=8250 rsquo=8217 sbquo=8218 scaron=353 sdot=8901 sect=167 "
        + "shy=173 sigma=963 sigmaf=962 sim=8764 spades=9824 sub=8834 sube=8838 sum=8721 sup=8835 sup1=185 "
        + "sup2=178 sup3=179 supe=8839 szlig=223 tau=964 there4=8756 theta=952 thetasym=977 thinsp=8201 thorn=254 "
        + "tilde=732 times=215 trade=8482 uArr=8657 uacute=250 uarr=8593 ucirc=251 ugrave=249 uml=168 upsih=978 "
        + "upsilon=965 uuml=252 weierp=8472 xi=958 yacute=253 yen=165 yuml=255 zeta=950 zwj=8205 zwnj=8204";

    /** Entity name (case-sensitive) to Unicode code point. */
    private static final HashMap<String, Integer> namedentities = new HashMap<String, Integer>();

    /** Elements that never have content and are emitted as {@code <x/>}. */
    private static final List<String> emptytags = new ArrayList<String>();

    /** For each tag, the open elements that it implicitly closes when it starts. */
    private static final HashMap<String, List<String>> autoclosetags = new HashMap<String, List<String>>();

    // The tables are immutable after class initialisation; they used to be
    // rebuilt (and, for emptytags, grown) on every conversion.
    static {
        for (String entry : ENTITY_TABLE.split(" ")) {
            int eq = entry.indexOf('=');
            namedentities.put(entry.substring(0, eq), Integer.valueOf(entry.substring(eq + 1)));
        }
        for (String tag : "area base basefont br col frame hr img input isindex link meta param".split(" ")) {
            emptytags.add(tag);
        }
        autoClose("basefont", "basefont");
        autoClose("colgroup", "colgroup");
        autoClose("dd", "dd", "dt");
        autoClose("dt", "dt", "dd");
        autoClose("li", "li");
        autoClose("p", "p");
        autoClose("thead", "tbody", "tfoot");
        autoClose("tbody", "thead", "tfoot");
        autoClose("tfoot", "thead", "tbody");
        autoClose("th", "th", "td");
        autoClose("td", "th", "td");
        autoClose("tr", "tr");
    }

    /** Registers that starting {@code tag} closes a directly enclosing element named in {@code closes}. */
    private static void autoClose(String tag, String... closes) {
        List<String> list = new ArrayList<String>();
        for (String name : closes) {
            list.add(name);
        }
        autoclosetags.put(tag, list);
    }

    /**
     * Converts an HTML string into an XML document string.
     *
     * @param s the HTML text; a leading byte-order mark is skipped
     * @return the XML declaration followed by the converted markup (only the
     *         declaration when the input contains no content)
     * @throws NullPointerException if {@code s} is {@code null}
     */
    private String Convert2XML(String s) {
        if (s == null) {
            throw new NullPointerException("html input must not be null");
        }
        return new Converter(s).run();
    }

    /**
     * Reads a URL, converts the page to XML and saves it to a file.
     *
     * @param address     the address to read; the response is decoded as UTF-8
     * @param xmlFileName the target file; ".xml" is appended when missing
     * @throws Exception if the URL is malformed or reading/writing fails
     */
    public void URL2XML(String address, String xmlFileName) throws Exception {
        InputStream inStream = new URL(address).openStream();
        String html;
        try {
            html = readAll(new InputStreamReader(inStream, CHARSET));
        } finally {
            inStream.close();
        }
        writeXml(xmlFileName, Convert2XML(html));
    }

    /**
     * Reads an HTML file from disk, converts it to XML and saves the result.
     *
     * @param htmlFilePath path of the HTML file; it is decoded as UTF-8
     * @param xmlFileName  the target file; ".xml" is appended when missing
     * @throws Exception if reading or writing fails
     */
    public void HtmlFile2XML(String htmlFilePath, String xmlFileName) throws Exception {
        InputStream inStream = new FileInputStream(htmlFilePath);
        String html;
        try {
            html = readAll(new InputStreamReader(inStream, CHARSET));
        } finally {
            inStream.close();
        }
        writeXml(xmlFileName, Convert2XML(html));
    }

    /**
     * Converts an HTML string to XML, returning it or saving it to a file.
     *
     * @param htmlString  the HTML text
     * @param xmlFileName the target file, used only when {@code toFile} is
     *                    true; ".xml" is appended when missing
     * @param toFile      true to write the result to {@code xmlFileName}
     * @return the XML text when {@code toFile} is false, otherwise ""
     * @throws Exception if writing the file fails
     */
    public String StringPattern2XML(String htmlString, String xmlFileName, boolean toFile) throws Exception {
        String xml = Convert2XML(htmlString);
        if (!toFile) {
            return xml;
        }
        writeXml(xmlFileName, xml);
        return "";
    }

    /**
     * Demo entry point.
     *
     * @param args optional: the address to convert, then the output file name
     */
    public static void main(String[] args) {
        String address = args.length > 0 ? args[0] : "http://www.bk4.com.vn";
        String target = args.length > 1 ? args[1] : "xxxx.xml";
        try {
            Html2Xml h2x = new Html2Xml();
            System.out.println("starting convert. Please wait...");
            h2x.URL2XML(address, target);
            System.out.println("convert completed...");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Reads all lines from {@code source}, terminating each with '\n'. */
    private static String readAll(Reader source) throws IOException {
        BufferedReader reader = new BufferedReader(source);
        StringBuilder text = new StringBuilder();
        // readLine() returning null is the only reliable end-of-stream signal.
        for (String line = reader.readLine(); line != null; line = reader.readLine()) {
            text.append(line).append('\n');
        }
        return text.toString();
    }

    /** Writes {@code xml} as UTF-8, appending ".xml" to the name when missing. */
    private static void writeXml(String xmlFileName, String xml) throws IOException {
        String target = xmlFileName.endsWith(".xml") ? xmlFileName : xmlFileName + ".xml";
        Writer writer = new OutputStreamWriter(new FileOutputStream(target), CHARSET);
        try {
            writer.write(xml);
        } finally {
            writer.close();
        }
    }

    /** Holds the state of one conversion; one instance per input string. */
    private static final class Converter {
        private final String html;
        private final int limit;
        private final StringBuilder out = new StringBuilder();
        private final Stack<String> openTags = new Stack<String>();
        private final List<String> attrNames = new ArrayList<String>();
        private final StringBuilder attrs = new StringBuilder();
        private final StringBuilder attrValue = new StringBuilder();
        private states state = states.text;
        private states prevState = states.text;
        /** Scratch buffer: tag/attribute name, or raw entity text after '&'. */
        private String name = "";
        private String tagName = "";
        private String attrName = "";
        /** Quote that ends the current attribute value; ' ' means unquoted. */
        private char attrDelim = '"';
        private int entValue;
        private char prec = ' ';
        private char preprec = ' ';

        Converter(String html) {
            this.html = html;
            this.limit = html.length();
        }

        /** Runs the state machine over the whole input and returns the document. */
        String run() {
            int start = 0;
            if (html.startsWith("\uFEFF")) {
                start = 1;
            } else if (html.startsWith("\u00EF\u00BB\u00BF")) {
                // UTF-8 BOM bytes that were decoded as ISO-8859-1.
                start = 3;
            }
            // Stop once the root element has been closed: XML allows one root.
            for (int i = start; i < limit && (out.length() == 0 || !openTags.empty()); i++) {
                i = step(i, html.charAt(i));
            }
            flushPendingEntity();
            while (!openTags.empty()) {
                closeTop();
            }
            return XML_DECLARATION + out;
        }

        /**
         * Feeds one character to the current state.
         *
         * @return the index of the last consumed character; {@code i - 1}
         *         asks the loop to process the same character again
         */
        private int step(int i, char c) {
            switch (state) {
            case text:
                inText(c);
                return i;
            case tag:
                return inTag(i, c);
            case endtag:
                inEndTag(c);
                return i;
            case attrtext:
                return inAttrText(i, c);
            case tillquote:
                return inTillQuote(i, c);
            case specialtag:
                inSpecialTag(c);
                return i;
            case comment:
                inComment(c);
                return i;
            case entity:
                return inEntity(i, c);
            case namedentity:
                return inNamedEntity(i, c);
            case numericentity:
            case hexaentity:
                return inNumericEntity(i, c);
            case tillinst:
                if (c == '?') {
                    state = states.andgt;
                }
                return i;
            case andgt:
                // A run of '?' keeps waiting for the closing '>'.
                if (c != '?') {
                    state = c == '>' ? states.text : states.tillinst;
                }
                return i;
            case tillgt:
                if (c == '>') {
                    state = states.text;
                }
                return i;
            default:
                throw new IllegalStateException("unexpected parser state " + state);
            }
        }

        /** Character data between tags. */
        private void inText(char c) {
            if (c == '<') {
                name = "";
                tagName = "";
                attrName = "";
                attrs.setLength(0);
                attrNames.clear();
                state = states.tag;
                return;
            }
            if (openTags.empty()) {
                if (Character.isWhitespace(c)) {
                    return; // whitespace before the root element is dropped
                }
                openRoot();
            }
            if (c == '&') {
                beginEntity();
                return;
            }
            out.append(c);
        }

        /** Inside a start tag, outside of any attribute value. */
        private int inTag(int i, char c) {
            if (c == '?' && tagName.isEmpty()) {
                state = states.tillinst;
                return i;
            }
            if (c == '!' && tagName.isEmpty()) {
                state = states.specialtag;
                prec = ' ';
                return i;
            }
            if (c == '/' && name.isEmpty() && tagName.isEmpty()) {
                state = states.endtag;
                return i;
            }
            if (Character.isWhitespace(c)) {
                if (name.isEmpty()) {
                    return i;
                }
                if (tagName.isEmpty() && !name.equals("_")) {
                    tagName = name;
                    name = "";
                } else if (attrName.isEmpty()) {
                    // Kept pending: an '=' may still follow the whitespace.
                    attrName = name.toLowerCase(Locale.ENGLISH);
                    name = "";
                }
                return i;
            }
            if (c == '=') {
                if (attrName.isEmpty()) {
                    attrName = name.toLowerCase(Locale.ENGLISH);
                    name = "";
                }
                state = states.tillquote;
                return i;
            }
            boolean hasTagName = !tagName.isEmpty() || !name.isEmpty();
            if (c == '/' && hasTagName) {
                settleTagHead();
                openTag(true);
                state = states.tillgt;
                return i;
            }
            if (c == '>') {
                if (!hasTagName) {
                    state = states.text; // "<>" carries nothing to emit
                    return i;
                }
                settleTagHead();
                return closeStartTag(i);
            }
            if (!attrName.isEmpty()) {
                finishAttribute(null); // previous attribute had no value
            }
            name = appendNameChar(name, c);
            return i;
        }

        /** Moves the pending name into the tag name or a valueless attribute. */
        private void settleTagHead() {
            if (tagName.isEmpty()) {
                tagName = name;
                name = "";
            }
            if (attrName.isEmpty() && !name.isEmpty()) {
                attrName = name.toLowerCase(Locale.ENGLISH);
                name = "";
            }
            if (!attrName.isEmpty()) {
                finishAttribute(null);
            }
        }

        /** Inside an end tag, after "&lt;/". */
        private void inEndTag(char c) {
            if (c == '>') {
                String closing = name.toLowerCase(Locale.ENGLISH);
                if (openTags.contains(closing)) {
                    // Close everything that was left open inside the element.
                    while (!openTags.peek().equals(closing)) {
                        closeTop();
                    }
                    closeTop();
                } else if (!closing.equals("html") && openTags.empty()) {
                    openRoot();
                }
                state = states.text;
                return;
            }
            if (Character.isWhitespace(c)) {
                return;
            }
            name = appendNameChar(name, c);
        }

        /** Inside an attribute value (quoted, or unquoted when attrDelim is ' '). */
        private int inAttrText(int i, char c) {
            boolean unquoted = attrDelim == ' ';
            if (c == attrDelim || (unquoted && Character.isWhitespace(c))) {
                finishAttribute(attrValue.toString());
                state = states.tag;
                return i;
            }
            if (unquoted && c == '>') {
                finishAttribute(attrValue.toString());
                return closeStartTag(i);
            }
            if (unquoted && isSelfClosingSlash(i)) {
                finishAttribute(attrValue.toString());
                openTag(true);
                state = states.tillgt;
                return i;
            }
            if (c == '&') {
                beginEntity();
                return i;
            }
            switch (c) {
            case '"':
                attrValue.append("&quot;");
                break;
            case '\'':
                attrValue.append("&apos;");
                break;
            case '<':
                attrValue.append("&lt;");
                break;
            default:
                attrValue.append(c);
            }
            return i;
        }

        /** After '=', waiting for the start of the attribute value. */
        private int inTillQuote(int i, char c) {
            if (Character.isWhitespace(c)) {
                return i;
            }
            if (c == '"' || c == '\'') {
                attrDelim = c;
                attrValue.setLength(0);
                state = states.attrtext;
                return i;
            }
            if (c == '>') {
                finishAttribute("");
                return closeStartTag(i);
            }
            if (isSelfClosingSlash(i)) {
                finishAttribute("");
                openTag(true);
                state = states.tillgt;
                return i;
            }
            // Unquoted value: let the attribute-text state handle this character
            // so that '&' and '<' get the same treatment as everywhere else.
            attrDelim = ' ';
            attrValue.setLength(0);
            state = states.attrtext;
            return i - 1;
        }

        /** After "&lt;!": a comment, or a declaration that is skipped. */
        private void inSpecialTag(char c) {
            if (c == '>') {
                state = states.text;
                return;
            }
            if (c != '-') {
                state = states.tillgt;
                return;
            }
            if (prec == '-') {
                state = states.comment;
                preprec = ' ';
                return;
            }
            prec = c;
        }

        /** Inside a comment; ends at "--&gt;". Comments are not copied. */
        private void inComment(char c) {
            if (c == '>' && prec == '-' && preprec == '-') {
                state = states.text;
                return;
            }
            preprec = prec;
            prec = c;
        }

        /** Remembers where the '&amp;' was seen and starts entity parsing. */
        private void beginEntity() {
            name = "";
            entValue = 0;
            prevState = state;
            state = states.entity;
        }

        /** First character after '&amp;'. */
        private int inEntity(int i, char c) {
            if (c == '#') {
                name = "#";
                state = states.numericentity;
                return i;
            }
            if (!Character.isLetterOrDigit(c)) {
                return abandonEntity(i);
            }
            name += c;
            state = states.namedentity;
            return i;
        }

        /** Inside a named entity such as {@code &nbsp;}. */
        private int inNamedEntity(int i, char c) {
            if (c == ';') {
                emit(resolveNamedEntity());
                name = "";
                state = prevState;
                return i;
            }
            if (!Character.isLetterOrDigit(c) || name.length() >= MAX_ENTITY_NAME_LENGTH) {
                return abandonEntity(i);
            }
            name += c;
            return i;
        }

        /** Inside a decimal or hexadecimal character reference. */
        private int inNumericEntity(int i, char c) {
            boolean hex = state == states.hexaentity;
            if (!hex && (c == 'x' || c == 'X') && name.equals("#")) {
                name += c;
                state = states.hexaentity;
                return i;
            }
            boolean hasDigits = name.length() > (hex ? 2 : 1);
            if (c == ';' && hasDigits) {
                emit(isXmlChar(entValue) ? "&#" + entValue + ";" : "&#65533;");
                name = "";
                state = prevState;
                return i;
            }
            int radix = hex ? 16 : 10;
            int digit = c < 128 ? Character.digit(c, radix) : -1;
            if (digit < 0) {
                return abandonEntity(i);
            }
            name += c;
            // Clamp just above the Unicode range so long inputs cannot overflow.
            entValue = Math.min(entValue * radix + digit, 0x110000);
            return i;
        }

        /** Emits the text read so far as literal characters and re-reads c. */
        private int abandonEntity(int i) {
            emit("&amp;" + name);
            name = "";
            state = prevState;
            return i - 1;
        }

        /** Emits an entity that was still open when the input ended. */
        private void flushPendingEntity() {
            if (state == states.entity || state == states.namedentity
                    || state == states.numericentity || state == states.hexaentity) {
                emit("&amp;" + name);
                name = "";
                state = prevState;
            }
        }

        /** Translates the entity in {@code name} to XML-safe text. */
        private String resolveNamedEntity() {
            String lower = name.toLowerCase(Locale.ENGLISH);
            if (lower.equals("amp") || lower.equals("lt") || lower.equals("gt")
                    || lower.equals("quot") || lower.equals("apos")) {
                return "&" + lower + ";";
            }
            // Entity names are case-sensitive (Eacute vs eacute); the
            // lower-case lookup only rescues mis-cased input such as &NBSP;.
            Integer code = namedentities.get(name);
            if (code == null) {
                code = namedentities.get(lower);
            }
            return code == null ? "&amp;" + name + ";" : "&#" + code + ";";
        }

        /** Appends entity text to the text output or the attribute value. */
        private void emit(String text) {
            if (prevState == states.text) {
                out.append(text);
            } else {
                attrValue.append(text);
            }
        }

        /**
         * Adds the pending attribute to the tag being built.
         *
         * @param value the escaped value, or null to repeat the attribute
         *              name (valueless HTML attribute)
         */
        private void finishAttribute(String value) {
            if (attrName.isEmpty()) {
                attrName = "_";
            }
            if (attrName.equals("_")) {
                while (attrNames.contains(attrName)) {
                    attrName += "_";
                }
            }
            // XML forbids duplicate attributes; xmlns would rebind the namespace.
            if (!attrNames.contains(attrName) && !attrName.equals("xmlns")) {
                attrNames.add(attrName);
                attrs.append(' ').append(attrName).append("=\"")
                     .append(value == null ? attrName : value).append('"');
            }
            attrName = "";
        }

        /**
         * Emits the start tag described by tagName and attrs.
         *
         * @return true when the element was pushed on the open-element stack
         */
        private boolean openTag(boolean selfClosing) {
            tagName = tagName.isEmpty() ? "_" : tagName.toLowerCase(Locale.ENGLISH);
            if (openTags.empty() && !tagName.equals("html")) {
                openRoot();
            }
            List<String> closable = autoclosetags.get(tagName);
            if (closable != null && !openTags.empty() && closable.contains(openTags.peek())) {
                closeTop();
            }
            if (tagName.equals("tr") && !openTags.empty() && openTags.peek().equals("table")) {
                out.append("<tbody>");
                openTags.push("tbody");
            }
            out.append('<').append(tagName).append(attrs);
            if (selfClosing || emptytags.contains(tagName)) {
                out.append("/>");
                return false;
            }
            out.append('>');
            openTags.push(tagName);
            return true;
        }

        /** Handles the '>' that ends a start tag; copies a script body if one follows. */
        private int closeStartTag(int i) {
            state = states.text;
            boolean pushed = openTag(false);
            if (pushed && tagName.equals("script")) {
                openTags.pop();
                return copyScript(i + 1);
            }
            return i;
        }

        /**
         * Copies a script body verbatim into a CDATA section and closes the
         * script element. CDATA markers already present in the script are
         * removed because CDATA sections cannot nest.
         *
         * @return the index of the last consumed input character
         */
        private int copyScript(int from) {
            int end = indexOfIgnoreCase(html, SCRIPT_END, from);
            String body = html.substring(from, end < 0 ? limit : end).replace("<![CDATA[", "");
            while (body.contains("]]>")) {
                body = body.replace("]]>", "");
            }
            out.append("<![CDATA[").append(body).append("]]></script>");
            return end < 0 ? limit - 1 : end + SCRIPT_END.length() - 1;
        }

        /** True when position i holds the '/' of a "/&gt;" tag ending. */
        private boolean isSelfClosingSlash(int i) {
            return html.charAt(i) == '/' && i + 1 < limit && html.charAt(i + 1) == '>';
        }

        private void openRoot() {
            out.append("<html>");
            openTags.push("html");
        }

        private void closeTop() {
            out.append("</").append(openTags.pop()).append('>');
        }

        /** Appends c to a tag/attribute name, replacing characters XML names cannot hold. */
        private static String appendNameChar(String name, char c) {
            if (Character.isLetter(c) || (Character.isLetterOrDigit(c) && !name.isEmpty())) {
                return name + c;
            }
            if (name.isEmpty()) {
                return "_";
            }
            if (c == '-') {
                return name + "-";
            }
            return name.equals("_") ? name : name + "_";
        }

        /** True for code points allowed by the XML 1.0 Char production. */
        private static boolean isXmlChar(int v) {
            return v == 0x9 || v == 0xA || v == 0xD
                    || (v >= 0x20 && v <= 0xD7FF)
                    || (v >= 0xE000 && v <= 0xFFFD)
                    || (v >= 0x10000 && v <= 0x10FFFF);
        }

        private static int indexOfIgnoreCase(String text, String needle, int from) {
            int last = text.length() - needle.length();
            for (int pos = from; pos <= last; pos++) {
                if (text.regionMatches(true, pos, needle, 0, needle.length())) {
                    return pos;
                }
            }
            return -1;
        }
    }
}

Tài liệu tham khảo:

http://www.ibm.com/developerworks/web/library/x-html5xhtml2.html?S_TACT=105AGX08&S_CMP=EDU

6 Responses to “Chuyển Html sang XML”

  1. Anhkho said

    Vui lòng viết lại mã bằng C# đi bạn ui

  2. Anhkho said

    Xin lỗi đã lộn – Anh Hải Vui lòng Xó dùm

  3. ly hue can said

    cach hoc don gian de hieu nhat doi voi mon cong nghe phan mem la nhu the nao? trong mon cong nghe phan mem co nganh do hoa khong? trinh do toan va hoa hoi kem hoc tot mon nay ko?can phai lam gi de hoc tot mon cong nghe phan mem? mong thay giai dap nhung cau hoi tren dum em

  4. Võ Văn Hải said

    Bạn hỏi không đúng chủ đề rồi!
    Câu hỏi của bạn cũng không rõ ràng: môn công nghệ phần mềm là 1 môn học sao lại có ngành đồ họa? không hiểu!

  5. Ngọc Hường said

    Thầy ơi! Cái file xml vừa được chuyển từ url = “http://dantri.com.vn/”. bị lỗi font rồi Thầy ơi!

  6. Ngọc Hường said

    Thầy ơi! em chuyển font lại được gồi. Đã phiền Thầy

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

 
%d bloggers like this: