Chuyển Html sang XML
Sau đây là lớp với các phương thức để chuyển các url/file/text dạng HTML sang XML.
Bài viết tham khảo từ trang http://sourceforge.net/projects/light-html2xml sau đó thêm các phương thức utils.
Chúc các bạn thành công!
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.Stack;

/**
 * Converts "tag soup" HTML (from a URL, a file or a string) into well-formed XML.
 *
 * <p>The conversion is a single pass, character-driven state machine. It lower-cases tag and
 * attribute names, quotes every attribute value, closes tags whose end tag HTML allows to be
 * omitted, self-closes void elements, wraps script bodies in a CDATA section, drops comments,
 * DOCTYPE declarations and processing instructions, and rewrites HTML named entities as numeric
 * character references.
 */
public class Html2Xml {

    /** States of the HTML scanner. */
    enum states {
        text, tag, endtag, attrtext, script, endscript, specialtag, comment, skipcdata, entity,
        namedentity, numericentity, hexaentity, tillgt, tillquote, tillinst, andgt
    }

    private static final String CDATA_OPEN = "<![CDATA[";

    /** Longest entity name in the table ("thetasym"); longer runs are not entities. */
    private static final int MAX_ENTITY_NAME = 8;

    /** HTML named entities as space separated {@code name=codePoint} pairs. */
    private static final String ENTITY_TABLE =
          "AElig=198 Aacute=193 Acirc=194 Agrave=192 Alpha=913 Aring=197 Atilde=195 Auml=196 "
        + "Beta=914 Ccedil=199 Chi=935 Dagger=8225 Delta=916 ETH=208 Eacute=201 Ecirc=202 "
        + "Egrave=200 Epsilon=917 Eta=919 Euml=203 Gamma=915 Iacute=205 Icirc=206 Igrave=204 "
        + "Iota=921 Iuml=207 Kappa=922 Lambda=923 Mu=924 Ntilde=209 Nu=925 OElig=338 "
        + "Oacute=211 Ocirc=212 Ograve=210 Omega=937 Omicron=927 Oslash=216 Otilde=213 Ouml=214 "
        + "Phi=934 Pi=928 Prime=8243 Psi=936 Rho=929 Scaron=352 Sigma=931 THORN=222 "
        + "Tau=932 Theta=920 Uacute=218 Ucirc=219 Ugrave=217 Upsilon=933 Uuml=220 Xi=926 "
        + "Yacute=221 Yuml=376 Zeta=918 "
        + "aacute=225 acirc=226 acute=180 aelig=230 agrave=224 alpha=945 and=8743 ang=8736 "
        + "aring=229 asymp=8776 atilde=227 auml=228 bdquo=8222 beta=946 brvbar=166 bull=8226 "
        + "cap=8745 ccedil=231 cedil=184 cent=162 chi=967 circ=710 clubs=9827 cong=8773 "
        + "copy=169 crarr=8629 cup=8746 curren=164 dagger=8224 darr=8595 deg=176 delta=948 "
        + "diams=9830 divide=247 eacute=233 ecirc=234 egrave=232 empty=8709 emsp=8195 ensp=8194 "
        + "epsilon=949 equiv=8801 eta=951 eth=240 euml=235 euro=8364 exist=8707 exists=8707 "
        + "fnof=402 forall=8704 frac12=189 frac14=188 frac34=190 gamma=947 ge=8805 harr=8596 "
        + "hearts=9829 hellip=8230 iacute=237 icirc=238 iexcl=161 igrave=236 infin=8734 int=8747 "
        + "iota=953 iquest=191 isin=8712 iuml=239 kappa=954 lambda=955 laquo=171 larr=8592 "
        + "lceil=8968 ldquo=8220 le=8804 lfloor=8970 lowast=8727 loz=9674 lrm=8206 lsaquo=8249 "
        + "lsquo=8216 macr=175 mdash=8212 micro=181 middot=183 minus=8722 mu=956 nabla=8711 "
        + "nbsp=160 ndash=8211 ne=8800 ni=8715 not=172 notin=8713 nsub=8836 ntilde=241 "
        + "nu=957 oacute=243 ocirc=244 oelig=339 ograve=242 oline=8254 omega=969 omicron=959 "
        + "oplus=8853 or=8744 ordf=170 ordm=186 oslash=248 otilde=245 otimes=8855 ouml=246 "
        + "para=182 part=8706 permil=8240 perp=8869 phi=966 pi=960 piv=982 plusmn=177 "
        + "pound=163 prime=8242 prod=8719 prop=8733 psi=968 radic=8730 raquo=187 rarr=8594 "
        + "rceil=8969 rdquo=8221 reg=174 rfloor=8971 rho=961 rlm=8207 rsaquo=8250 rsquo=8217 "
        + "sbquo=8218 scaron=353 sdot=8901 sect=167 shy=173 sigma=963 sigmaf=962 sim=8764 "
        + "spades=9824 sub=8834 sube=8838 sum=8721 sup=8835 sup1=185 sup2=178 sup3=179 "
        + "supe=8839 szlig=223 tau=964 there4=8756 theta=952 thetasym=977 thinsp=8201 thorn=254 "
        + "tilde=732 times=215 trade=8482 uacute=250 uarr=8593 ucirc=251 ugrave=249 uml=168 "
        + "up2=178 upsih=978 upsilon=965 uuml=252 xi=958 yacute=253 yen=165 yuml=255 "
        + "zeta=950 zwj=8205 zwnj=8204";

    /** Entity name (case-sensitive) to Unicode code point. */
    private static final Map<String, Integer> NAMED_ENTITIES = new HashMap<String, Integer>();

    /** Void elements: always written as {@code <tag/>} and never pushed on the open-tag stack. */
    private static final Set<String> EMPTY_TAGS = new HashSet<String>(Arrays.asList(
            "area", "base", "basefont", "br", "col", "frame", "hr", "img", "input", "isindex",
            "link", "meta", "param"));

    /** Tag being opened mapped to the innermost open tags that it implicitly closes. */
    private static final Map<String, List<String>> AUTO_CLOSE_TAGS =
            new HashMap<String, List<String>>();

    // The tables are immutable data, so they are built once per class load instead of being
    // re-populated (and, for the lists, grown) on every conversion.
    static {
        for (String entry : ENTITY_TABLE.trim().split("\\s+")) {
            int eq = entry.indexOf('=');
            NAMED_ENTITIES.put(entry.substring(0, eq), Integer.valueOf(entry.substring(eq + 1)));
        }
        implicitlyCloses("basefont", "basefont");
        implicitlyCloses("colgroup", "colgroup");
        implicitlyCloses("dd", "dd", "dt");
        implicitlyCloses("dt", "dt", "dd");
        implicitlyCloses("li", "li");
        implicitlyCloses("p", "p");
        implicitlyCloses("thead", "tbody", "tfoot");
        implicitlyCloses("tbody", "thead", "tfoot");
        implicitlyCloses("tfoot", "thead", "tbody");
        implicitlyCloses("th", "td", "th");
        implicitlyCloses("td", "th", "td");
        implicitlyCloses("tr", "tr");
    }

    /** Registers the open tags that are closed automatically when {@code opening} starts. */
    private static void implicitlyCloses(String opening, String... openTags) {
        AUTO_CLOSE_TAGS.put(opening, Arrays.asList(openTags));
    }

    /** Lower-cases markup names independently of the default locale. */
    private static String lower(String s) {
        return s.toLowerCase(Locale.ENGLISH);
    }

    /**
     * Converts an HTML document held in a string to an XML document.
     *
     * @param s the HTML text; {@code null} is treated as an empty document
     * @return an XML declaration followed by the converted markup
     */
    private String Convert2XML(String s) {
        return new Parser(s == null ? "" : s).run();
    }

    /** One conversion run; holds all scanner state so that conversions never share data. */
    private static final class Parser {
        private final String src;
        private final StringBuilder out = new StringBuilder();
        private final Deque<String> open = new ArrayDeque<String>();
        private final StringBuilder attrs = new StringBuilder();
        private final StringBuilder attrValue = new StringBuilder();
        private final Set<String> attrNames = new HashSet<String>();

        private states state = states.text;
        private states prevState = states.text;
        private String name = "";
        private String tagName = "";
        private String attrName = "";
        private int entValue;
        private char attrDelim = '"';
        // Last two characters seen; used to recognise "--", "</", "<![" and "]]" sequences.
        private char prec = ' ';
        private char preprec = ' ';

        Parser(String src) {
            this.src = src;
        }

        /** Scans the whole input and returns the XML document. */
        String run() {
            int limit = src.length();
            String encoding = "iso-8859-1";
            int start = 0;
            // A UTF-8 byte order mark shows up as these three chars when the bytes were decoded
            // one byte per char. The length guard keeps empty/short input from throwing.
            if (limit >= 3 && src.charAt(0) == 0xEF && src.charAt(1) == 0xBB
                    && src.charAt(2) == 0xBF) {
                encoding = "utf-8";
                start = 3;
            }
            // Stop as soon as the root element has been closed: XML allows only one root.
            for (int i = start; i < limit && (out.length() == 0 || !open.isEmpty()); i++) {
                if (consume(i)) {
                    i--; // the character ended a construct and must be scanned again
                }
            }
            if (state == states.script || state == states.endscript
                    || state == states.skipcdata) {
                out.append("]]></script>"); // input ended inside a script body
            }
            while (!open.isEmpty()) {
                out.append("</").append(open.pop()).append('>');
            }
            return "<?xml version=\"1.0\" encoding=\"" + encoding + "\"?>\n" + out;
        }

        /**
         * Feeds the character at index {@code i} to the current state.
         *
         * @return {@code true} when the same character has to be scanned again in the new state
         */
        private boolean consume(int i) {
            char c = src.charAt(i);
            switch (state) {
                case text:
                    inText(c);
                    return false;
                case tag:
                    inTag(c);
                    return false;
                case endtag:
                    inEndTag(c);
                    return false;
                case tillquote:
                    return inTillQuote(c);
                case attrtext:
                    inAttrText(c, i);
                    return false;
                case script:
                    inScript(c);
                    return false;
                case endscript:
                    inEndScript(c);
                    return false;
                case skipcdata:
                    inSkipCdata(c);
                    return false;
                case specialtag:
                    inSpecialTag(c);
                    return false;
                case comment:
                    if (c == '>' && prec == '-' && preprec == '-') {
                        state = states.text;
                    } else {
                        preprec = prec;
                        prec = c;
                    }
                    return false;
                case entity:
                    return inEntity(c);
                case numericentity:
                    inNumericEntity(c, 10);
                    return false;
                case hexaentity:
                    inNumericEntity(c, 16);
                    return false;
                case namedentity:
                    return inNamedEntity(c);
                case tillinst:
                    if (c == '?') {
                        state = states.andgt;
                    }
                    return false;
                case andgt:
                    if (c == '>') {
                        state = states.text;
                    } else if (c != '?') {
                        state = states.tillinst;
                    }
                    return false;
                case tillgt:
                    if (c == '>') {
                        state = states.text;
                    }
                    return false;
                default:
                    return false;
            }
        }

        /** Character data between tags. */
        private void inText(char c) {
            if (c == '<') {
                name = "";
                tagName = "";
                attrName = "";
                attrs.setLength(0);
                attrNames.clear();
                state = states.tag;
                return;
            }
            if (open.isEmpty()) {
                if (Character.isWhitespace(c)) {
                    return; // whitespace before the root element is dropped
                }
                openRoot();
            }
            if (c == '&') {
                beginEntity();
                return;
            }
            out.append(c);
        }

        /** Inside {@code <...>}: tag name, attribute names and the end of the tag. */
        private void inTag(char c) {
            boolean noTag = tagName.length() == 0;
            if (c == '?' && noTag) {
                state = states.tillinst;
                return;
            }
            if (c == '!' && noTag) {
                state = states.specialtag;
                prec = ' ';
                return;
            }
            if (c == '/' && noTag && name.length() == 0) {
                state = states.endtag;
                return;
            }
            if (Character.isWhitespace(c)) {
                if (name.length() == 0) {
                    return;
                }
                if (noTag && !name.equals("_")) {
                    tagName = name;
                } else if (attrName.length() == 0) {
                    attrName = lower(name);
                }
                name = "";
                return;
            }
            if (c == '=') {
                if (attrName.length() == 0) {
                    attrName = lower(name);
                    name = "";
                }
                attrValue.setLength(0); // never reuse the previous attribute's value
                state = states.tillquote;
                return;
            }
            if (c == '/' || c == '>') {
                if (noTag && name.length() > 0) {
                    tagName = name;
                    name = "";
                }
                if (tagName.length() > 0) {
                    if (name.length() > 0 && attrName.length() == 0) {
                        attrName = lower(name); // value-less attribute right before the end
                        name = "";
                    }
                    commitAttribute(true);
                    openTag(c == '/');
                    return;
                }
                // "<>" has no name at all: fall through and treat '>' as a junk name character.
            }
            // A new name starts, so a pending attribute name turned out to have no value.
            commitAttribute(true);
            name = appendNameChar(name, c);
        }

        /** Inside {@code </...>}. */
        private void inEndTag(char c) {
            if (c == '>') {
                String tag = lower(name);
                if (open.contains(tag)) {
                    String top;
                    while (!(top = open.pop()).equals(tag)) {
                        out.append("</").append(top).append('>');
                    }
                    out.append("</").append(tag).append('>');
                } else if (!tag.equals("html") && open.isEmpty()) {
                    openRoot();
                }
                state = states.text;
                return;
            }
            if (!Character.isWhitespace(c)) {
                // Same sanitising as for start tags, so "<my-tag>" and "</my-tag>" match.
                name = appendNameChar(name, c);
            }
        }

        /** After {@code =}: decides between a quoted and an unquoted attribute value. */
        private boolean inTillQuote(char c) {
            if (Character.isWhitespace(c)) {
                return false;
            }
            attrValue.setLength(0);
            state = states.attrtext;
            if (c == '"' || c == '\'') {
                attrDelim = c;
                return false;
            }
            attrDelim = ' '; // unquoted: let the attribute-value state handle this character
            return true;
        }

        /** Inside an attribute value. */
        private void inAttrText(char c, int i) {
            boolean unquoted = attrDelim == ' ';
            if (c == attrDelim || (unquoted && Character.isWhitespace(c))) {
                commitAttribute(false);
                name = "";
                state = states.tag;
                return;
            }
            // In an unquoted value a '/' only ends the tag when it is part of "/>", otherwise
            // values such as href=http://host/path would be cut at the first slash.
            boolean selfClose = c == '/' && i + 1 < src.length() && src.charAt(i + 1) == '>';
            if (unquoted && (c == '>' || selfClose)) {
                commitAttribute(false);
                openTag(selfClose);
                return;
            }
            if (c == '&') {
                beginEntity();
                return;
            }
            switch (c) {
                case '"':
                    attrValue.append("&quot;");
                    break;
                case '\'':
                    attrValue.append("&apos;");
                    break;
                case '<':
                    attrValue.append("&lt;");
                    break;
                default:
                    attrValue.append(c);
            }
        }

        /** Script body; copied verbatim into the CDATA section opened by {@link #openTag}. */
        private void inScript(char c) {
            if (c == '/' && prec == '<') {
                state = states.endscript;
                name = "";
                return;
            }
            if (c == '[' && prec == '!' && preprec == '<') {
                out.setLength(out.length() - 2); // take back "<!" until we know what follows
                name = "<![";
                state = states.skipcdata;
                return;
            }
            if (c == '>' && prec == ']' && preprec == ']') {
                // An inner "]]>" would terminate our own CDATA section: drop it.
                out.setLength(out.length() - 2);
                prec = ' ';
                preprec = ' ';
                return;
            }
            out.append(c);
            preprec = prec;
            prec = c;
        }

        /** After {@code </} in a script body: either {@code </script>} or ordinary script text. */
        private void inEndScript(char c) {
            if (c == '>' && lower(name).equals("script")) {
                out.setLength(out.length() - 1); // remove the '<' already copied
                out.append("]]></script>");
                state = states.text;
                return;
            }
            name += c;
            if (!"script".startsWith(lower(name))) {
                out.append('/').append(name);
                resumeScript(c);
            }
        }

        /** After {@code <![} in a script body: swallows an inner {@code <![CDATA[} marker. */
        private void inSkipCdata(char c) {
            name += c;
            if (name.equals(CDATA_OPEN)) {
                prec = ' ';
                preprec = ' ';
                state = states.script;
            } else if (!CDATA_OPEN.startsWith(name)) {
                out.append(name);
                resumeScript(c);
            }
        }

        /** Returns to the script body after {@code last} has been copied to the output. */
        private void resumeScript(char last) {
            prec = last;
            preprec = ' ';
            state = states.script;
        }

        /** After {@code <!}: a comment when followed by {@code --}, otherwise skipped to '>'. */
        private void inSpecialTag(char c) {
            if (c != '-') {
                state = c == '>' ? states.text : states.tillgt;
            } else if (prec == '-') {
                state = states.comment;
                preprec = ' ';
            } else {
                prec = c;
            }
        }

        /** Starts an entity reference in text or in an attribute value. */
        private void beginEntity() {
            name = "";
            entValue = 0;
            prevState = state;
            state = states.entity;
        }

        /** First character after '&'. */
        private boolean inEntity(char c) {
            if (c == '#') {
                state = states.numericentity;
                return false;
            }
            if (Character.isLetterOrDigit(c)) {
                name = String.valueOf(c);
                state = states.namedentity;
                return false;
            }
            emit("&amp;"); // a lone ampersand must be escaped in XML
            state = prevState;
            return true;
        }

        /** Digits of {@code &#...;} (radix 10) or {@code &#x...;} (radix 16). */
        private void inNumericEntity(char c, int radix) {
            if (radix == 10 && (c == 'x' || c == 'X')) {
                state = states.hexaentity;
                return;
            }
            if (c == ';') {
                if (entValue > 0) { // "&#0;" is not a legal XML character reference
                    emit("&#" + entValue + ";");
                }
                state = prevState;
                return;
            }
            int digit = Character.digit(c, radix);
            if (digit >= 0) {
                entValue = entValue * radix + digit;
            }
        }

        /** Letters of a named entity such as {@code &nbsp;}. */
        private boolean inNamedEntity(char c) {
            if (c == ';') {
                String lowerName = lower(name);
                if (lowerName.equals("amp") || lowerName.equals("lt") || lowerName.equals("gt")
                        || lowerName.equals("quot") || lowerName.equals("apos")) {
                    emit("&" + lowerName + ";"); // predefined in XML
                } else {
                    // Entity names are case-sensitive (&Eacute; vs &eacute;); only fall back
                    // to the lower-case spelling when the exact one is unknown.
                    Integer codePoint = NAMED_ENTITIES.get(name);
                    if (codePoint == null) {
                        codePoint = NAMED_ENTITIES.get(lowerName);
                    }
                    emit(codePoint != null ? "&#" + codePoint + ";" : "&amp;" + name + ";");
                }
                name = "";
                state = prevState;
                return false;
            }
            if (!Character.isLetterOrDigit(c) || name.length() >= MAX_ENTITY_NAME) {
                emit("&amp;" + name); // not an entity after all: keep it as escaped text
                name = "";
                state = prevState;
                return true;
            }
            name += c;
            return false;
        }

        /** Writes resolved entity text to wherever the entity was found. */
        private void emit(String text) {
            if (prevState == states.text) {
                out.append(text);
            } else {
                attrValue.append(text);
            }
        }

        /**
         * Adds the pending attribute to the tag being built and clears the pending name.
         * Duplicate names and {@code xmlns} are dropped.
         *
         * @param bare {@code true} for a value-less attribute, written as {@code name="name"}
         */
        private void commitAttribute(boolean bare) {
            String key = attrName;
            attrName = "";
            if (key.length() == 0) {
                return;
            }
            if (key.equals("_")) {
                while (attrNames.contains(key)) {
                    key += "_"; // keep several unnameable attributes apart
                }
            }
            if (key.equals("xmlns") || !attrNames.add(key)) {
                return;
            }
            attrs.append(' ').append(key).append("=\"");
            if (bare) {
                attrs.append(key);
            } else {
                attrs.append(attrValue);
            }
            attrs.append('"');
        }

        /** Appends {@code c} to a tag/attribute name, replacing illegal characters by '_'. */
        private static String appendNameChar(String current, char c) {
            boolean empty = current.length() == 0;
            if (Character.isLetter(c) || (!empty && Character.isDigit(c))) {
                return current + c;
            }
            if (empty) {
                return "_";
            }
            if (c == '-') {
                return current + '-';
            }
            return current.equals("_") ? current : current + '_';
        }

        /** Opens the implicit root element. */
        private void openRoot() {
            out.append("<html>");
            open.push("html");
        }

        /**
         * Writes the start tag collected in {@code tagName}/{@code attrs}, first closing or
         * inserting whatever elements HTML lets authors omit.
         *
         * @param selfClosing {@code true} when the tag was written as {@code <tag ... />}
         */
        private void openTag(boolean selfClosing) {
            String tag = lower(tagName);
            tagName = tag;
            if (tag.length() == 0) { // e.g. "<=x>": nothing sensible to emit
                state = selfClosing ? states.tillgt : states.text;
                return;
            }
            if (!tag.equals("html") && open.isEmpty()) {
                openRoot();
            }
            List<String> closes = AUTO_CLOSE_TAGS.get(tag);
            if (closes != null && !open.isEmpty() && closes.contains(open.peek())) {
                out.append("</").append(open.pop()).append('>');
            }
            if (tag.equals("tr") && "table".equals(open.peek())) {
                out.append("<tbody>");
                open.push("tbody");
            }
            out.append('<').append(tag).append(attrs);
            if (selfClosing) {
                out.append("/>");
                state = states.tillgt; // skip the rest of "/>"
            } else if (EMPTY_TAGS.contains(tag)) {
                out.append("/>");
                state = states.text;
            } else if (tag.equals("script")) {
                // Script text is copied verbatim inside CDATA; the element is closed by the
                // script states themselves, so it is not put on the open-tag stack.
                out.append('>').append(CDATA_OPEN);
                prec = ' ';
                preprec = ' ';
                state = states.script;
            } else {
                out.append('>');
                open.push(tag);
                state = states.text;
            }
        }
    }

    /** Reads all lines from {@code reader}, terminating each one with {@code '\n'}. */
    private static String readAll(BufferedReader reader) throws IOException {
        StringBuilder text = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            text.append(line).append('\n');
        }
        return text.toString();
    }

    /** Writes {@code xml} to {@code xmlFileName}, appending ".xml" when it is missing. */
    private static void writeXml(String xml, String xmlFileName) throws IOException {
        String target = xmlFileName.endsWith(".xml") ? xmlFileName : xmlFileName + ".xml";
        // NOTE(review): the platform default charset is used here, as before, although the XML
        // declaration names iso-8859-1 or utf-8 - confirm which encoding callers expect.
        FileWriter writer = new FileWriter(target);
        try {
            writer.write(xml);
        } finally {
            writer.close();
        }
    }

    /**
     * Reads a URL, converts its content to XML and saves the result to a file.
     *
     * @param address     the address to read
     * @param xmlFileName the XML file to write; ".xml" is appended when missing
     * @throws Exception if the address cannot be read or the file cannot be written
     */
    public void URL2XML(String address, String xmlFileName) throws Exception {
        InputStream inStream = new URL(address).openStream();
        String html;
        try {
            // NOTE(review): decoded with the platform default charset, as before.
            html = readAll(new BufferedReader(new InputStreamReader(inStream)));
        } finally {
            inStream.close();
        }
        writeXml(Convert2XML(html), xmlFileName);
    }

    /**
     * Reads an HTML file from disk, converts it to XML and saves the result to disk.
     *
     * @param htmlFilePath path of the HTML file
     * @param xmlFileName  the XML file to write; ".xml" is appended when missing
     * @throws Exception if the input cannot be read or the output cannot be written
     */
    public void HtmlFile2XML(String htmlFilePath, String xmlFileName) throws Exception {
        BufferedReader reader = new BufferedReader(new FileReader(htmlFilePath));
        String html;
        try {
            html = readAll(reader);
        } finally {
            reader.close();
        }
        writeXml(Convert2XML(html), xmlFileName);
    }

    /**
     * Converts an HTML string to XML.
     *
     * @param htmlString  the HTML text
     * @param xmlFileName the XML file to write when {@code toFile} is set; ignored otherwise
     * @param toFile      {@code true} to save the result to {@code xmlFileName}
     * @return the XML text, or an empty string when the result was written to a file
     * @throws Exception if the file cannot be written
     */
    public String StringPattern2XML(String htmlString, String xmlFileName, boolean toFile)
            throws Exception {
        String xml = Convert2XML(htmlString);
        if (!toFile) {
            return xml;
        }
        writeXml(xml, xmlFileName);
        return "";
    }

    /**
     * Demo: downloads a page and stores it as {@code xxxx.xml}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            Html2Xml h2x = new Html2Xml();
            String add = "http://www.bk4.com.vn";
            System.out.println("starting convert. Please wait...");
            h2x.URL2XML(add, "xxxx.xml");
            System.out.println("convert completed...");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Tài liệu tham khảo:
http://www.ibm.com/developerworks/web/library/x-html5xhtml2.html?S_TACT=105AGX08&S_CMP=EDU
Anhkho said
Vui lòng viết lại mã bằng C# đi bạn ui
Anhkho said
Xin lỗi đã lộn – Anh Hải vui lòng xóa dùm
ly hue can said
cach hoc don gian de hieu nhat doi voi mon cong nghe phan mem la nhu the nao? trong mon cong nghe phan mem co nganh do hoa khong? trinh do toan va hoa hoi kem hoc tot mon nay ko?can phai lam gi de hoc tot mon cong nghe phan mem? mong thay giai dap nhung cau hoi tren dum em
Võ Văn Hải said
Bạn hỏi không đúng chủ đề rồi!
Câu hỏi của bạn cũng không rõ ràng: môn công nghệ phần mềm là 1 môn học sao lại có ngành đồ họa? không hiểu!
Bach said
Chao ban, Minh dang tim tren mang cach giai quyet van de cua minh va tinh co vao Blog cua ban. Minh co mot van de nho rat mong ban tra loi giup:
Minh dang phai Test mot Webportal. Tren trang web nay co rat nhieu Buttons va cac Fields. Minh can phai kiem tra chung. Lay vi du the nay: Tren trang web do co mot Field co noi dung nhu sau: “Xe nay co bien so la: B123456”. Bay gio minh phai viet mot Methode Test bang C# va dua ra man hinh bien so nay.
Van de la: de dinh vi duoc Field nay, minh dung FireBug. Sau do di chuyen chuot den Field do tren Web, bang cach nay minh se thu duoc 2 kieu du lieu ve Field nay. Do la Ma XPATH va Ma HTML. Minh khong biet phai lam the nao de lay duoc Field nay ra, sau do loc lay bien so “B123456” dua ra man hinh thong qua XPATH hoac HTML
Ban co the goi y cho minh duoc khong ?
Cam on ban rat nhieu. Sorry vi may tinh cua minh khong viet duoc Tieng Viet
Ngọc Hường said
Thầy ơi! Cái file xml vừa được chuyển từ url = “http://dantri.com.vn/”. bị lỗi font rồi Thầy ơi!
Ngọc Hường said
Thầy ơi! em chuyển font lại được gồi. Đã phiền Thầy