HTMLParser 是目前 Java 领域中应用较为广泛的 HTML 解析库之一。
HTMLParser的主页是http://htmlparser.sourceforge.net/
初次接触 HTMLParser,首先需要了解它的核心模块:org.htmlparser.Parser 类。
下面介绍几种 Parser 的初始化方法,详见代码:
文件:com/htmlparser/TestMain.java
package com.htmlparser; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.util.NodeList; /** * * @author Michael */ public class TestMain { private static String ENCODE = "UTF-8"; /** * @param args */ public static void main(String[] args) { TestMain test = new TestMain(); // String url = // "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=nero9%E5%88%BB%E5%BD%95ape&aq=f&aqi=&aql=&oq=&gs_rfai="; String url = "http://www.baidu.com/s?wd=nero9%BF%CC%C2%BCape&oq=nero9k&rsp=1&f=3&sugT=6679"; test.testNodeFilter(url); } /** * 几种初始化的方法 */ private void testInitParser() { try { Parser parser1 = new Parser(); parser1.setURL("http://www.baidu.com"); parser1.setEncoding(parser1.getEncoding()); // url 初始化的方法 HttpURLConnection.setFollowRedirects(true); URL netUrl = new URL("http://www.baidu.com"); HttpURLConnection con = (HttpURLConnection) netUrl.openConnection(); con.setInstanceFollowRedirects(false); con.connect(); Parser parser2 = new Parser(con); // 根据字符串初始化 String htmlString = this.readHtmlFile("d:/test/test.html"); Parser parser3 = Parser.createParser(htmlString, ENCODE); // 根据字符串初始化 String htmlStr1 = "<html><head><title>Test</title>" + "<link href=’/test01/css.css' text='text/css' rel='stylesheet'/>" + "</head><body><div><a href='www.baidu.com' target='_blank'>baidu</a></div>" + "<div><a href='www.sina.com' target='_blank'>sina</a></div></body></html>"; Parser parser4 = new Parser(htmlString); } catch (Exception e) { e.printStackTrace(); } } /** * NodeFilter * @param url */ private void testNodeFilter(String url) { System.out.println("NodeFilter start..."); try { HttpURLConnection.setFollowRedirects(true); URL netUrl = new URL(url); HttpURLConnection con = (HttpURLConnection) 
netUrl.openConnection(); con.setInstanceFollowRedirects(false); con.connect(); Parser parser = new Parser(con); parser.setEncoding(parser.getEncoding()); NodeFilter filter = new TagNameFilter("A"); NodeList list = parser.extractAllNodesThatMatch(filter); for (int i = 0; i < list.size(); i++) { System.out.println(list.elementAt(i).toHtml()); } } catch (Exception e) { e.printStackTrace(); } System.out.println("NodeFilter end"); } /** * 读取HTML文件 * @param htmlFileName * @return */ private String readHtmlFile(String htmlFileName) { BufferedReader bis = null; try { bis = new BufferedReader(new InputStreamReader(new FileInputStream( new File(htmlFileName)), ENCODE)); StringBuffer htmlsb = new StringBuffer(); String readTemp; while ((readTemp = bis.readLine()) != null) { htmlsb.append(readTemp); } bis.close(); return htmlsb.toString(); } catch (Exception e) { return null; } finally { if (null != bis) { try { bis.close(); } catch (IOException ioe) { ioe.printStackTrace(); } } } } } |
原创文章,转载请注明: 转载自micmiu – 软件开发+生活点滴[ http://www.micmiu.com/ ]
0 条评论。