Chuyển Html sang XML
Sau đây là lớp với các phương thức để chuyển các url/file/text dạng HTML sang XML.
Bài viết tham khảo từ trang http://sourceforge.net/projects/light-html2xml sau đó thêm các phương thức utils.
Chúc các bạn thành công!
|
package vovanhai.wordpress.com; import java.io.BufferedReader; import java.io.FileReader; import java.io.FileWriter; import java.io.InputStream; import java.io.PrintWriter; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Scanner; import java.util.Stack; public class Html2Xml { enum states {text, tag, endtag, attrtext, script, endscript, specialtag, comment, skipcdata, entity, namedentity, numericentity, hexaentity, tillgt, tillquote, tillinst, andgt}; private static HashMap<String, Integer> namedentities = new HashMap<String, Integer>(); private static List<String> emptytags = new ArrayList<String>(); private static HashMap<String, List<String>> autoclosetags = new HashMap<String, List<String>>(); private String Convert2XML(String s) { namedentities.put(“AElig”, 198); namedentities.put(“Aacute”, 193); namedentities.put(“Acirc”, 194); namedentities.put(“Agrave”, 192); namedentities.put(“Alpha”, 913); namedentities.put(“Aring”, 197); namedentities.put(“Atilde”, 195); namedentities.put(“Auml”, 196); namedentities.put(“Beta”, 914); namedentities.put(“Ccedil”, 199); namedentities.put(“Chi”, 935); namedentities.put(“Dagger”, 8225); namedentities.put(“Delta”, 916); namedentities.put(“ETH”, 208); namedentities.put(“Eacute”, 201); namedentities.put(“Ecirc”, 202); namedentities.put(“Egrave”, 200); namedentities.put(“Epsilon”, 917); namedentities.put(“Eta”, 919); namedentities.put(“Euml”, 203); namedentities.put(“Gamma”, 915); namedentities.put(“Iacute”, 205); namedentities.put(“Icirc”, 206); namedentities.put(“Igrave”, 204); namedentities.put(“Iota”, 921); namedentities.put(“Iuml”, 207); namedentities.put(“Kappa”, 922); namedentities.put(“Lambda”, 923); namedentities.put(“Mu”, 924); namedentities.put(“Ntilde”, 209); namedentities.put(“Nu”, 925); namedentities.put(“OElig”, 338); namedentities.put(“Oacute”, 211); namedentities.put(“Ocirc”, 212); namedentities.put(“Ograve”, 210); namedentities.put(“Omega”, 937); 
namedentities.put(“Omicron”, 927); namedentities.put(“Oslash”, 216); namedentities.put(“Otilde”, 213); namedentities.put(“Ouml”, 214); namedentities.put(“Phi”, 934); namedentities.put(“Pi”, 928); namedentities.put(“Prime”, 8243); namedentities.put(“Psi”, 936); namedentities.put(“Rho”, 929); namedentities.put(“Scaron”, 352); namedentities.put(“Sigma”, 931); namedentities.put(“THORN”, 222); namedentities.put(“Tau”, 932); namedentities.put(“Theta”, 920); namedentities.put(“Uacute”, 218); namedentities.put(“Ucirc”, 219); namedentities.put(“Ugrave”, 217); namedentities.put(“Upsilon”, 933); namedentities.put(“Uuml”, 220); namedentities.put(“Xi”, 926); namedentities.put(“Yacute”, 221); namedentities.put(“Yuml”, 376); namedentities.put(“Zeta”, 918); namedentities.put(“aacute”, 225); namedentities.put(“acirc”, 226); namedentities.put(“acute”, 180); namedentities.put(“aelig”, 230); namedentities.put(“agrave”, 224); namedentities.put(“alpha”, 945); namedentities.put(“and”, 8743); namedentities.put(“ang”, 8736); namedentities.put(“aring”, 229); namedentities.put(“asymp”, 8776); namedentities.put(“atilde”, 227); namedentities.put(“auml”, 228); namedentities.put(“bdquo”, 8222); namedentities.put(“beta”, 946); namedentities.put(“brvbar”, 166); namedentities.put(“bull”, 8226); namedentities.put(“cap”, 8745); namedentities.put(“ccedil”, 231); namedentities.put(“cedil”, 184); namedentities.put(“cent”, 162); namedentities.put(“chi”, 967); namedentities.put(“circ”, 710); namedentities.put(“clubs”, 9827); namedentities.put(“cong”, 8773); namedentities.put(“copy”, 169); namedentities.put(“crarr”, 8629); namedentities.put(“cup”, 8746); namedentities.put(“curren”, 164); namedentities.put(“dagger”, 8224); namedentities.put(“darr”, 8595); namedentities.put(“deg”, 176); namedentities.put(“delta”, 948); namedentities.put(“diams”, 9830); namedentities.put(“divide”, 247); namedentities.put(“eacute”, 233); namedentities.put(“ecirc”, 234); namedentities.put(“egrave”, 232); 
namedentities.put(“empty”, 8709); namedentities.put(“emsp”, 8195); namedentities.put(“ensp”, 8194); namedentities.put(“epsilon”, 949); namedentities.put(“equiv”, 8801); namedentities.put(“eta”, 951); namedentities.put(“eth”, 240); namedentities.put(“euml”, 235); namedentities.put(“euro”, 8364); namedentities.put(“exists”, 8707); namedentities.put(“fnof”, 402); namedentities.put(“forall”, 8704); namedentities.put(“frac12″, 189); namedentities.put(“frac14″, 188); namedentities.put(“frac34″, 190); namedentities.put(“gamma”, 947); namedentities.put(“ge”, 8805); namedentities.put(“harr”, 8596); namedentities.put(“hearts”, 9829); namedentities.put(“hellip”, 8230); namedentities.put(“iacute”, 237); namedentities.put(“icirc”, 238); namedentities.put(“iexcl”, 161); namedentities.put(“igrave”, 236); namedentities.put(“infin”, 8734); namedentities.put(“int”, 8747); namedentities.put(“iota”, 953); namedentities.put(“iquest”, 191); namedentities.put(“isin”, 8712); namedentities.put(“iuml”, 239); namedentities.put(“kappa”, 954); namedentities.put(“lambda”, 923); namedentities.put(“laquo”, 171); namedentities.put(“larr”, 8592); namedentities.put(“lceil”, 8968); namedentities.put(“ldquo”, 8220); namedentities.put(“le”, 8804); namedentities.put(“lfloor”, 8970); namedentities.put(“lowast”, 8727); namedentities.put(“loz”, 9674); namedentities.put(“lrm”, 8206); namedentities.put(“lsaquo”, 8249); namedentities.put(“lsquo”, 8216); namedentities.put(“macr”, 175); namedentities.put(“mdash”, 8212); namedentities.put(“micro”, 181); namedentities.put(“middot”, 183); namedentities.put(“minus”, 8722); namedentities.put(“mu”, 956); namedentities.put(“nabla”, 8711); namedentities.put(“nbsp”, 160); namedentities.put(“ndash”, 8211); namedentities.put(“ne”, 8800); namedentities.put(“ni”, 8715); namedentities.put(“not”, 172); namedentities.put(“notin”, 8713); namedentities.put(“nsub”, 8836); namedentities.put(“ntilde”, 241); namedentities.put(“nu”, 925); namedentities.put(“oacute”, 243); 
namedentities.put(“ocirc”, 244); namedentities.put(“oelig”, 339); namedentities.put(“ograve”, 242); namedentities.put(“oline”, 8254); namedentities.put(“omega”, 969); namedentities.put(“omicron”, 959); namedentities.put(“oplus”, 8853); namedentities.put(“or”, 8744); namedentities.put(“ordf”, 170); namedentities.put(“ordm”, 186); namedentities.put(“oslash”, 248); namedentities.put(“otilde”, 245); namedentities.put(“otimes”, 8855); namedentities.put(“ouml”, 246); namedentities.put(“para”, 182); namedentities.put(“part”, 8706); namedentities.put(“permil”, 8240); namedentities.put(“perp”, 8869); namedentities.put(“phi”, 966); namedentities.put(“pi”, 960); namedentities.put(“piv”, 982); namedentities.put(“plusmn”, 177); namedentities.put(“pound”, 163); namedentities.put(“prime”, 8242); namedentities.put(“prod”, 8719); namedentities.put(“prop”, 8733); namedentities.put(“psi”, 968); namedentities.put(“radic”, 8730); namedentities.put(“raquo”, 187); namedentities.put(“rarr”, 8594); namedentities.put(“rceil”, 8969); namedentities.put(“rdquo”, 8221); namedentities.put(“reg”, 174); namedentities.put(“rfloor”, 8971); namedentities.put(“rho”, 961); namedentities.put(“rlm”, 8207); namedentities.put(“rsaquo”, 8250); namedentities.put(“rsquo”, 8217); namedentities.put(“sbquo”, 8218); namedentities.put(“scaron”, 353); namedentities.put(“sdot”, 8901); namedentities.put(“sect”, 167); namedentities.put(“shy”, 173); namedentities.put(“sigma”, 963); namedentities.put(“sigmaf”, 962); namedentities.put(“sim”, 8764); namedentities.put(“spades”, 9824); namedentities.put(“sub”, 8834); namedentities.put(“sube”, 8838); namedentities.put(“sum”, 8721); namedentities.put(“sup”, 8835); namedentities.put(“sup1″, 185); namedentities.put(“sup3″, 179); namedentities.put(“supe”, 8839); namedentities.put(“szlig”, 223); namedentities.put(“tau”, 964); namedentities.put(“there4″, 8756); namedentities.put(“theta”, 952); namedentities.put(“thetasym”, 977); namedentities.put(“thinsp”, 8201); 
namedentities.put(“thorn”, 254); namedentities.put(“tilde”, 732); namedentities.put(“times”, 215); namedentities.put(“trade”, 8482); namedentities.put(“uacute”, 250); namedentities.put(“uarr”, 8593); namedentities.put(“ucirc”, 251); namedentities.put(“ugrave”, 249); namedentities.put(“uml”, 168); namedentities.put(“up2″, 178); namedentities.put(“upsih”, 978); namedentities.put(“upsilon”, 965); namedentities.put(“uuml”, 252); namedentities.put(“xi”, 958); namedentities.put(“yacute”, 253); namedentities.put(“yen”, 165); namedentities.put(“yuml”, 255); namedentities.put(“zeta”, 950); namedentities.put(“zwj”, 8205); namedentities.put(“zwnj”, 8204); emptytags.add(“area”); emptytags.add(“base”); emptytags.add(“basefont”); emptytags.add(“br”); emptytags.add(“col”); emptytags.add(“frame”); emptytags.add(“hr”); emptytags.add(“img”); emptytags.add(“input”); emptytags.add(“isindex”); emptytags.add(“link”); emptytags.add(“meta”); emptytags.add(“param”); autoclosetags.put(“basefont”, new ArrayList<String>()); autoclosetags.get(“basefont”).add(“basefont”); autoclosetags.put(“colgroup”, new ArrayList<String>()); autoclosetags.get(“colgroup”).add(“colgroup”); autoclosetags.put(“dd”, new ArrayList<String>()); autoclosetags.get(“dd”).add(“colgroup”); autoclosetags.put(“dt”, new ArrayList<String>()); autoclosetags.get(“dt”).add(“dt”); autoclosetags.put(“li”, new ArrayList<String>()); autoclosetags.get(“li”).add(“li”); autoclosetags.put(“p”, new ArrayList<String>()); autoclosetags.get(“p”).add(“p”); autoclosetags.put(“thead”, new ArrayList<String>()); autoclosetags.get(“thead”).add(“tbody”); autoclosetags.get(“thead”).add(“tfoot”); autoclosetags.put(“tbody”, new ArrayList<String>()); autoclosetags.get(“tbody”).add(“thead”); autoclosetags.get(“tbody”).add(“tfoot”); autoclosetags.put(“tfoot”, new ArrayList<String>()); autoclosetags.get(“tfoot”).add(“thead”); autoclosetags.get(“tfoot”).add(“tbody”); autoclosetags.put(“th”, new ArrayList<String>()); autoclosetags.get(“th”).add(“td”); 
autoclosetags.put(“td”, new ArrayList<String>()); autoclosetags.get(“td”).add(“th”); autoclosetags.get(“td”).add(“td”); autoclosetags.put(“tr”, new ArrayList<String>()); autoclosetags.get(“tr”).add(“tr”); String r2 = “”; String r = “”; int limit = s.length(); states state = states.text; states prevstate = state; Stack<String> opentags = new Stack<String>(); String name = “”; String tagname = “”; String attrname = “”; String attrs = “”; List<String> attrnames = new ArrayList<String>(); int entvalue = 0; char attrdelim = ‘”‘; String attrvalue = “”; String cs = “”; char prec = ‘ ’; char preprec = ‘ ’; char c = ‘ ’; int start = 0; String encoding = “”; if (s.charAt(0) == 0xEF && s.charAt(1) == 0xBB && s.charAt(2)== 0xBF) { encoding = “utf-8″; start = 3; } else { encoding = “iso-8859-1″; start = 0; } for (int i = start; i < limit && ((r2.equals(“”) && r.equals(“”)) || !opentags.empty()); i++) { if (r.length() > 10240) { r2 += r; r = “”; } c = s.charAt(i); switch (state) { case text: if (c == ‘<’) { name = “”; tagname = “”; attrname = “”; attrs = “”; attrnames.clear(); state = states.tag; break; } if (!Character.isWhitespace(c) && opentags.empty()) { r += “<html>”; opentags.push(“html”); } if (Character.isWhitespace(c) && opentags.empty()) { break; } if (c == ‘&’) { name = “”; entvalue = 0; prevstate = state; state = states.entity; break; } r += c; break; case tag: if (c == ‘?’ && tagname.equals(“”)) { state = states.tillinst; break; } if (c == ‘!’ && tagname.equals(“”)) { state = states.specialtag; prec = ‘ ’; break; } if (c == ‘/’ && name.equals(“”) && tagname.equals(“”)) { state = states.endtag; name = “”; break; } if (Character.isWhitespace(c)) { if (name.equals(“”)) { break; } if (tagname.equals(“”) && name != “_”) { tagname = name; name = “”; break; } if (attrname.equals(“”)) { attrname = name.toLowerCase(); name = “”; break; } break; } if (c == ‘=’) { if (attrname.equals(“”)) { attrname = name.toLowerCase(); name = “”; } state = states.tillquote; break; } if (c == 
‘/’ && (!tagname.equals(“”) || !name.equals(“”))) { if (tagname.equals(“”)) { tagname = name; } tagname = tagname.toLowerCase(); if (!tagname.equals(“html”) && opentags.empty()) { r += “<html>”; opentags.push(“html”); } if (autoclosetags.containsKey(tagname) && !opentags.empty()) { String prevtag = opentags.peek(); if (autoclosetags.get(tagname).contains(prevtag)) { opentags.pop(); r += “</” + prevtag + “>”; } } if (tagname.equals(“tr”) && opentags.peek().equals(“table”)) { r += “<tbody>”; opentags.push(“tbody”); } r += “<” + tagname + attrs + “/>”; state = states.tillgt; break; } if (c == ‘>’) { if (tagname.equals(“”) && !name.equals(“”)) { tagname = name; } if (!tagname.equals(“”)) { tagname = tagname.toLowerCase(); if (!tagname.equals(“html”) && opentags.empty()) { r += “<html>”; opentags.push(“html”); } if (autoclosetags.containsKey(tagname) && !opentags.empty()) { String prevtag = opentags.peek(); if (autoclosetags.get(tagname).contains(prevtag)) { opentags.pop(); r += “</” + prevtag + “>”; } } if (tagname.equals(“tr”) && opentags.peek().equals(“table”)) { r += “<tbody>”; opentags.push(“tbody”); } if (emptytags.contains(tagname)) { r += “<” + tagname.toLowerCase() + attrs + “/>”; } else { opentags.push(tagname); r += “<” + tagname + attrs + “>”; if (tagname.equals(“script”)) { r += “<![CDATA["; opentags.pop(); state = states.script; break; } } state = states.text; break; } } if (attrname.equals("_")) { while(attrnames.contains(attrname)) { attrname += "_"; } } if (!attrname.equals("") && !attrnames.contains(attrname) && !attrname.equals("xmlns")) { attrs += " " + attrname + "=\"" + attrname + "\""; attrname = ""; } cs = "" + c; name += (Character.isLetterOrDigit(c) && name != "") || Character.isLetter(c) ? cs : (name.equals("") ? "_" : (c == '-' ? "-" : (!name.equals("_") ? 
"_" : ""))); break; case endtag: if (c == '>') { name = name.toLowerCase(); if (opentags.search(name) != -1) { String prevtag; while (!(prevtag = opentags.pop()).equals(name)) { r += "</" + prevtag + ">"; } r += "</" + name + ">"; } else { if (!name.equals("html") && opentags.empty()) { r += "<html>"; opentags.push("html"); } } state = states.text; break; } if (Character.isWhitespace(c)) { break; } cs = "" + c; name += Character.isLetterOrDigit(c) ? cs : !name.equals("_") ? "_" : ""; break; case attrtext: if (c == attrdelim || (Character.isWhitespace(c) && attrdelim == ' ')) { if (attrname.equals("_")) { while(attrnames.contains(attrname)) { attrname += "_"; } } if (!attrnames.contains(attrname) && !attrname.equals("xmlns")) { attrnames.add(attrname); attrs += " " + attrname + "=\"" + attrvalue + "\""; } attrname = ""; state = states.tag; break; } if (attrdelim == ' ' && (c == '/' || c == '>')) { tagname = tagname.toLowerCase(); if (!tagname.equals("html") && opentags.empty()) { r += "<html>"; opentags.push("html"); } if (autoclosetags.containsKey(tagname) && !opentags.empty()) { String prevtag = opentags.peek(); if (autoclosetags.get(tagname).contains(prevtag)) { opentags.pop(); r += "</" + prevtag + ">"; } } if (attrname.equals("_")) { while(attrnames.contains(attrname)) { attrname += "_"; } } if (!attrnames.contains(attrname) && !attrname.equals("xmlns")) { attrnames.add(attrname); attrs += " " + attrname + "=\"" + attrvalue + "\""; } attrname = ""; if (c == '/') { r += "<" + tagname + attrs + "/>"; state = states.tillgt; break; } if (c == '>') { if (emptytags.contains(tagname)) { r += "<" + tagname + attrs + "/>"; state = states.text; break; } else { opentags.push(tagname); r += "<" + tagname + attrs + ">"; if (tagname.equals("script")) { r += "<![CDATA["; opentags.pop(); prec = ' '; preprec = ' '; state = states.script; break; } state = states.text; break; } } } if (c == '&') { name = ""; entvalue = 0; prevstate = state; state = states.entity; break; } cs = "" 
+ c; attrvalue += c == '"' ? """ : c == '\'' ? "'" : cs; break; case script: if (c == '/' && prec == '<') { state = states.endscript; name = ""; break; } if (c == '[' && prec == '!' && preprec == '<') { state = states.skipcdata; name = "<!["; break; } if (c == '>' && prec == ']‘ && preprec == ‘]’) { c = r.charAt(r.length() - 3); r = r.substring(0, r.length() - 4); } r += c; preprec = prec; prec = c; break; case endscript: if (c == ‘>’ && name.toLowerCase().equals(“script”)) { r = r.substring(0, r.length() - 1); r += “]]></script>”; state = states.text; break; } name += c; String sscr = “script”; if (!sscr.startsWith(name.toLowerCase())) { r += name; state = states.script; } break; case specialtag: if (c != ‘-’) { state = states.tillgt; break; } if (prec == ‘-’) { state = states.comment; preprec = ‘ ’; break; } prec = c; break; case comment: if (c == ‘>’ && prec == ‘-’ && preprec == ‘-’) { state = states.text; break; } preprec = prec; prec = c; break; case skipcdata: if (name.equals(“<![CDATA[")) { state = states.script; break; } name += c; String scdata = "<![CDATA["; if (!scdata.startsWith(name)) { r += name; state = states.script; } break; case entity: if (c == '#') { state = states.numericentity; break; } name += c; state = states.namedentity; break; case numericentity: if (c == 'x' || c == 'X') { state = states.hexaentity; break; } if (c == ';') { String ent = "&#" + entvalue + ";"; if (prevstate == states.text) { r += ent; } else { attrvalue += ent; } state = prevstate; break; } entvalue = entvalue * 10 + c - '0'; break; case hexaentity: if (c == ';') { String ent = "&#" + entvalue + ";"; if (prevstate == states.text) { r += ent; } else { attrvalue += ent; } state = prevstate; break; } entvalue = entvalue * 16 + (Character.isDigit(c) ? 
c - '0' : Character.toUpperCase(c) - 'A'); break; case namedentity: if (c == ';') { String ent; name = name.toLowerCase(); if (name.equals("amp") || name.equals("lt") || name.equals("gt") || name.equals("quot") || name.equals("apos")) { ent = "&" + name + ";"; name = ""; if (prevstate == states.text) { r += ent; } else { attrvalue += ent; } state = prevstate; break; } if (namedentities.containsKey(name)) { entvalue = namedentities.get(name); } else { entvalue = 0; } ent = "&#" + entvalue + ";"; name = ""; if (prevstate == states.text) { r += ent; } else { attrvalue += ent; } state = prevstate; break; } if (!Character.isLetterOrDigit(c) || name.length() > 6) { String ent = "&" + name; name = ""; if (prevstate == states.text) { r += ent; } else { attrvalue += ent; } state = prevstate; i--; break; } name += c; break; case tillinst: if (c == '?') { state = states.andgt; } break; case andgt: if (c == '>') { state = states.text; break; } state = states.tillinst; break; case tillgt: if (c == '>') { state = states.text; } break; case tillquote: if (Character.isWhitespace(c)) { break; } if (c == '"' || c == '\'') { attrdelim = c; attrvalue = ""; state = states.attrtext; break; } if (c == '/' || c == '>') { if (attrname.equals("_")) { while(attrnames.contains(attrname)) { attrname += "_"; } } if (!attrnames.contains(attrname) && !attrname.equals("xmlns")) { attrnames.add(attrname); attrs += " " + attrname + "=\"" + attrvalue + "\""; } attrname = ""; } if (c == '/') { r += "<" + tagname.toLowerCase() + attrs + "/>"; state = states.tillgt; break; } if (c == '>') { tagname = tagname.toLowerCase(); if (!tagname.equals("html") && opentags.empty()) { r += "<html>"; opentags.push("html"); } if (autoclosetags.containsKey(tagname) && !opentags.empty()) { String prevtag = opentags.peek(); if (autoclosetags.get(tagname).contains(prevtag)) { opentags.pop(); r += "</" + prevtag + ">"; } } if (emptytags.contains(tagname)) { r += "<" + tagname + attrs + "/>"; state = states.text; break; } 
else { opentags.push(tagname); r += "<" + tagname + attrs + ">"; if (tagname.equals("script")) { r += "<![CDATA["; opentags.pop(); state = states.script; break; } } } attrdelim = ' '; attrvalue = "" + c; state = states.attrtext; break; } } while (!opentags.empty()) { r += "</" + opentags.pop() + ">"; } r2 += r; return "<?xml version=\"1.0\" encoding=\"" + encoding + "\"?>\n" + r2; } /** * Ðọc 1 URL sau ðó chuyển thành XML rồi lưu xuống file * @param address là ðịa chỉ cần ðọc * @param xmlFileName là file xml chỉ ðịnh ðể lưu * @throws Exception */ public void URL2XML(String address,String xmlFileName) throws Exception{ URL url=new URL(address); InputStream inStream = url.openStream(); Scanner in = new Scanner(inStream); String s=""; while(in.hasNextLine()){ s+=in.nextLine()+ "\n"; } inStream.close(); in.close(); if(!xmlFileName.endsWith(".xml")) xmlFileName+=".xml"; FileWriter fw = new FileWriter(xmlFileName); PrintWriter pw = new PrintWriter(fw,true); pw.print(Convert2XML(s)); pw.close(); } /** * Ðọc 1 file HTML trên ðĩa sau ðó biến thành xml file rồi lưu xuống ðĩa * @param htmlFilePath là đường dẫn tuyệt đối đến file html * @param xmlFileName là file xml chỉ ðịnh ðể lưu * @throws Exception */ public void HtmlFile2XML(String htmlFilePath,String xmlFileName) throws Exception{ FileReader fr = new FileReader(htmlFilePath); BufferedReader br = new BufferedReader(fr); String s = ""; while (br.ready()) { s += br.readLine() + "\n"; } br.close(); fr.close(); if(!xmlFileName.endsWith(".xml")) xmlFileName+=".xml"; FileWriter fw = new FileWriter(xmlFileName); PrintWriter pw = new PrintWriter(fw,true); pw.print(Convert2XML(s)); pw.close(); } /** * Chuyển 1 chuỗi html ra dạng xml * @param htmlString chuỗi chứa ðịnh dạng html * @param xmlFileName là file xml chỉ ðịnh ðể lưu * @throws Exception */ public String StringPattern2XML(String htmlString,String xmlFileName,boolean toFile) throws Exception{ if(!toFile) return Convert2XML(htmlString); if(!xmlFileName.endsWith(".xml")) 
xmlFileName+=".xml"; FileWriter fw = new FileWriter(xmlFileName); PrintWriter pw = new PrintWriter(fw,true); pw.print(Convert2XML(htmlString)); pw.close(); return ""; } /** * Testing * @param args */ public static void main (String[] args){ try { Html2Xml h2x=new Html2Xml(); String add=“http://www.bk4.com.vn”; System.out.println(“starting convert. Please wait…”); h2x.URL2XML(add, “xxxx.xml”); System.out.println(“convert completed…”); /*String s=”<html><body><body/></html>”;//<></> String kq=h2x.StringPattern2XML(s, ”", false); System.out.println(kq);*/ } catch (Exception e) { e.printStackTrace(); } } } |
Tài liệu tham khảo:
http://www.ibm.com/developerworks/web/library/x-html5xhtml2.html?S_TACT=105AGX08&S_CMP=EDU
Anhkho said
Vui lòng viết lại mã bằng C# đi bạn ui
Anhkho said
Xin lỗi, đã lộn – anh Hải vui lòng xóa dùm.
ly hue can said
Cách học đơn giản, dễ hiểu nhất đối với môn công nghệ phần mềm là như thế nào? Trong môn công nghệ phần mềm có ngành đồ họa không? Trình độ toán và hóa hơi kém thì học tốt môn này không? Cần phải làm gì để học tốt môn công nghệ phần mềm? Mong thầy giải đáp những câu hỏi trên dùm em.
Võ Văn Hải said
Bạn hỏi không đúng chủ đề rồi!
Câu hỏi của bạn cũng không rõ ràng: môn công nghệ phần mềm là 1 môn học sao lại có ngành đồ họa? không hiểu!