JAVA使用爬蟲抓取網站網頁內容的方法

本文實例講述了JAVA使用爬蟲抓取網站網頁內容的方法。分享給大家供大家參考。具體如下：

最近在用JAVA研究爬網技術，算是入了個門，把自己的心得和大家分享一下。
以下提供兩種方法：一種是用Apache提供的包，另一種是用JAVA自帶的類庫。

代碼如下:

				?

									// 第一種方法

									//這種方法是用apache提供的包,簡單方便

									//但是要用到以下包:commons-codec-1.4.jar

									// commons-httpclient-3.1.jar

									// commons-logging-1.0.4.jar

									public static String createhttpClient(String url, String param) {

									  HttpClient client = new HttpClient();

									  String response = null;

									  String keyword = null;

									  PostMethod postMethod = new PostMethod(url);

									//  try {

									//   if (param != null)

									//    keyword = new String(param.getBytes("gb2312"), "ISO-8859-1");

									//  } catch (UnsupportedEncodingException e1) {

									//   // TODO Auto-generated catch block

									//   e1.printStackTrace();

									//  }

									  // NameValuePair[] data = { new NameValuePair("keyword", keyword) };

									  // // 將表單的值放入postMethod中

									  // postMethod.setRequestBody(data);

									  // 以上部分是帶參數抓取,我自己把它注銷了．大家可以把注銷消掉研究下

									  try {

									   int statusCode = client.executeMethod(postMethod);

									   response = new String(postMethod.getResponseBodyAsString()

									     .getBytes("ISO-8859-1"), "gb2312");

									     //這里要注意下 gb2312要和你抓取網頁的編碼要一樣

									   String p = response.replaceAll("//&[a-zA-Z]{1,10};", "")

									     .replaceAll("<[^>]*>", "");//去掉網頁中帶有html語言的標簽

									   System.out.println(p);

									  } catch (Exception e) {

									   e.printStackTrace();

									  }

									  return response;

									}

									// 第二種方法

									// 這種方法是JAVA自帶的URL來抓取網站內容

									public String getPageContent(String strUrl, String strPostRequest,

									   int maxLength) {

									  // 讀取結果網頁

									  StringBuffer buffer = new StringBuffer();

									  System.setProperty("sun.net.client.defaultConnectTimeout", "5000");

									  System.setProperty("sun.net.client.defaultReadTimeout", "5000");

									  try {

									   URL newUrl = new URL(strUrl);

									   HttpURLConnection hConnect = (HttpURLConnection) newUrl

									     .openConnection();

									   // POST方式的額外數據

									   if (strPostRequest.length() > 0) {

									    hConnect.setDoOutput(true);

									    OutputStreamWriter out = new OutputStreamWriter(hConnect

									      .getOutputStream());

									    out.write(strPostRequest);

									    out.flush();

									    out.close();

									   }

									   // 讀取內容

									   BufferedReader rd = new BufferedReader(new InputStreamReader(

									     hConnect.getInputStream()));

									   int ch;

									   for (int length = 0; (ch = rd.read()) > -1

									     && (maxLength <= 0 || length < maxLength); length++)

									    buffer.append((char) ch);

									   String s = buffer.toString();

									   s.replaceAll("//&[a-zA-Z]{1,10};", "").replaceAll("<[^>]*>", "");

									   System.out.println(s);

									   rd.close();

									   hConnect.disconnect();

									   return buffer.toString().trim();

									  } catch (Exception e) {

									   // return "錯誤:讀取網頁失敗！";

									   //

									   return null;

									  }

									}

然後寫個測試類：

				?

									public static void main(String[] args) {

									  String url = "//www.ythuaji.com.cn";

									  String keyword = "服務器之家";

									  createhttpClient p = new createhttpClient();

									  String response = p.createhttpClient(url, keyword);

									  // 第一種方法

									  // p.getPageContent(url, "post", 100500);//第二種方法

									}