0
http://v1000.vn/bang-xep-hang?ref=bang-xep-hang-1000-doanh-nghiep-dong-thue-thu-nhap-nhieu-nhat-2012
곧 특정 웹 사이트에 대한 웹 크롤러를 만들기 위해 노력하고는, 페이지 (자바 스크립트를 사용)를 변경 링크를 찾을 수 내 XPath는 아니다 NullPointExecetion을 유발하는 작업. 여러 가지 방법으로 XPath를 수정하려했지만 아무 것도 효과가 없었습니다.
또한 스크립트를 실행 한 후 새 페이지를 가져 오는 방법이 필요합니까?
> package gimasys.webService;
import java.io.IOException;
import java.net.MalformedURLException;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.ThreadedRefreshHandler;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlButton;
import com.gargoylesoftware.htmlunit.html.HtmlLink;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
public class Crawlv1000 {
/**
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
final WebCrawler wc = new WebCrawler();
final PageCrawler pc = new PageCrawler();
final WebClient webClient = new WebClient(BrowserVersion.CHROME_16);
webClient.setRefreshHandler(new ThreadedRefreshHandler()); // This is to allow handling the page operation using threads else an exception will pop up
try {
HtmlPage page = webClient.getPage("http://v1000.vn/bang-xep-hang?ref=bang-xep-hang-1000-doanh-nghiep-dong-thue-thu-nhap-nhieu-nhat-2012");
HtmlAnchor link = page.getFirstByXPath("//a[@href='javascript:loadRankingTable(3)']");
link.click();
System.out.println(page.getTextContent());
} catch (FailingHttpStatusCodeException | IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
/*
wc.crawl("http://v1000.vn/bang-xep-hang?ref=bang-xep-hang-1000-doanh-nghiep-dong-thue-thu-nhap-nhieu-nhat-2012");
for (String url:wc.urlList)
{
pc.crawl(url);
}
*/
}
}
감사합니다, 호치민 응웬