JAVA爬虫爬取携程酒店数据selenium实现
在爬取携程的时候碰到很多的壁垒,接下来分析所有过程
1.根据以往经验最初想到用jsoup去解析每个HTML元素,然后拿到酒店数据,然后发现解析HTML根本拿不到id为hotel_list的div,所以也就无法通过静态的HTML去获取数据
可以看到标签里面根本就没有数据,因为这里的数据是动态加载的,所以无法通过静态解析拿取,接下来采用动态方式拿取
2.第一种方法就不行, 于是疯狂查博文,找到了携程动态数据的接口,在AjaxHotelList.aspx里我找到了酒店,里面有HTML的代码拼接,数据都在这里了,怎么拿取呢?
模拟post请求,然后拿数据
在发送请求的时候注意下图红框中的信息。请求头必须要加上来源信息和浏览器信息。发送的参数就是Form Data里的数据,可以只传部分数据。
请求发送后,很遗憾还是没有拿到数据,可能是一些加密的处理。
3.虽然模拟请求拿不到数据,但是大致方向还是找到了,还剩下一种办法,就是用selenium自动化测试框架模拟浏览器,从浏览器页面中拿取数据。(由于能力有限,并没有**汉字识别验证码,这里用人工验证代替)
准备:
下载 selenium
下载Chromedriver(这里需要与自己的Chrome浏览器版本相对应,我下载的是当时最新的版本,后面放出链接)
上代码
import com.nf.xiecheng.entyty.Hotel; import org.openqa.selenium.By; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.openqa.selenium.chrome.ChromeDriver; import java.awt.*; import java.awt.event.KeyEvent; import java.util.ArrayList; import java.util.Iterator; import java.util.List; public class SelectFlight { private final int MAX_X=2560;//用于滑块验证,电脑分辨率 private final int MAX_Y=1408; private final int TARGET_X=MAX_X-733;//用于滑块验证,webdriver启动后,游览器中滑块验证的坐标位置 private final int TARGET_Y=MAX_Y-477; private static List<Hotel> hotelList = new ArrayList<Hotel>(); public static void main(String args[]) throws InterruptedException { SelectFlight s = new SelectFlight(); System.setProperty("webdriver.chrome.driver","D:\\myporject\\IDEworkspace\\chromedriver.exe");//chromedriver驱动地址,自己所放入的目录 WebDriver webDriver = new ChromeDriver(); webDriver.get("https://hotels.ctrip.com"); Thread.sleep(1000); //跳转登陆页面 WebElement login = ((ChromeDriver) webDriver).findElementByClassName("person-text"); login.click(); //登陆信息 Thread.sleep(1000); WebElement phone = webDriver.findElement(By.id("nloginname")); phone.sendKeys("13647610831"); WebElement passw = webDriver.findElement(By.id("npwd")); passw.sendKeys("a96968426"); //滑块验证 s.Robotcheck(); Thread.sleep(10000); //点击登陆 WebElement nsubmit = webDriver.findElement(By.id("nsubmit")); nsubmit.click(); //点击酒店搜索 Thread.sleep(2000); WebElement btnSearch = webDriver.findElement(By.id("btnSearch")); btnSearch.click(); Thread.sleep(5000); //进入主页 String pageSource = webDriver.getPageSource(); Thread.sleep(1000); WebElement nextPage = webDriver.findElement(By.id("downHerf")); nextPage.click(); //下一页 WebElement downHerfa = s.getNextPage(webDriver, "downHerf"); s.getHotelMassge(webDriver);//获取酒店信息 for (Hotel ph:hotelList ) { System.out.println(ph.toString()); } webDriver.close(); webDriver.quit(); } //获取下一页 public WebElement getNextPage(WebDriver webDriver,String nextPage){ WebElement nextWeb = 
webDriver.findElement(By.id(nextPage)); nextWeb.click(); return nextWeb; } //填入酒店信息 public void getHotelMassge(WebDriver webDriver){ WebElement hotel_list = webDriver.findElement(By.id("hotel_list")); List<WebElement> hotel_item = hotel_list.findElements(By.className("hotel_item")); System.err.println(hotel_item.size()); Iterator<WebElement> it = hotel_item.iterator(); while (it.hasNext()){ Hotel entry = new Hotel(); WebElement hotel = it.next(); //酒店名称 WebElement hotel_name = hotel.findElement(By.className("hotel_name")); WebElement a = hotel_name.findElement(By.tagName("a")); entry.setName(a.getAttribute("title")); //id String id = hotel_name.getAttribute("data-id"); entry.setId(id); //酒店地址 WebElement hotel_item_htladdress = hotel.findElement(By.className("hotel_item_htladdress")); List<WebElement> a_area = hotel_item_htladdress.findElements(By.tagName("a")); StringBuffer areabuffer = new StringBuffer(); for(int i = 0; i<a_area.size();i++){ areabuffer.append(a_area.get(i).getText()+","); if(i==a_area.size()-1){ areabuffer.append(a_area.get(i).getText()); } } entry.setArea(areabuffer.toString()); entry.setAddress(hotel_item_htladdress.getText()); //客户点评 WebElement hotelitem_judge_box = hotel.findElement(By.className("hotelitem_judge_box")); WebElement judge = hotelitem_judge_box.findElement(By.tagName("a")); entry.setEvaluate(judge.getAttribute("title")); //价钱 WebElement hotel_price_icon = hotel.findElement(By.className("hotel_price_icon")); WebElement j_price_lowList = hotel_price_icon.findElement(By.className("J_price_lowList")); entry.setLowprice(Double.parseDouble(j_price_lowList.getText())); hotelList.add(entry); } } //滑块验证 public void Robotcheck(){ //判断鼠标是否定位到指定位置 boolean x_ready = false; boolean y_ready = false; int x_move = 0; int y_move = 0; try { Robot robot = new Robot(); Point mousepoint = MouseInfo.getPointerInfo().getLocation(); System.out.println(mousepoint.x+" "+mousepoint.y); for(int i = 0;i<=MAX_X;i++){ robot.mouseMove(i,mousepoint.y); 
Thread.sleep(1); if(MAX_X - i == TARGET_X||TARGET_X + i == MAX_X){ x_ready = true; x_move = i; break; } } for(int j = 0; j<=MAX_Y;j++){ robot.mouseMove(x_move,j); Thread.sleep(1); if(MAX_Y - j == TARGET_Y || TARGET_Y + j == MAX_Y){ y_ready = true; y_move = j; break; } } robot.mouseMove(x_move,y_move); robot.mousePress(KeyEvent.BUTTON1_MASK); for(int i = 0; i < 300; i++){ Thread.sleep(5); robot.mouseMove(x_move+i,y_move); } robot.mouseRelease(KeyEvent.BUTTON1_MASK); } catch (AWTException e) { e.printStackTrace(); }catch (InterruptedException e) { e.printStackTrace(); } } }import com.nf.xiecheng.entyty.Hotel; import org.openqa.selenium.By; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.openqa.selenium.chrome.ChromeDriver; import java.awt.*; import java.awt.event.KeyEvent; import java.util.ArrayList; import java.util.Iterator; import java.util.List; public class SelectFlight { private final int MAX_X=2560;//用于滑块验证,电脑分辨率 private final int MAX_Y=1408; private final int TARGET_X=MAX_X-733;//用于滑块验证,webdriver启动后,游览器中滑块验证的坐标位置 private final int TARGET_Y=MAX_Y-477; private static List<Hotel> hotelList = new ArrayList<Hotel>(); public static void main(String args[]) throws InterruptedException { SelectFlight s = new SelectFlight(); System.setProperty("webdriver.chrome.driver","D:\\myporject\\IDEworkspace\\chromedriver.exe");//chromedriver驱动地址,自己所放入的目录 WebDriver webDriver = new ChromeDriver(); webDriver.get("https://hotels.ctrip.com"); Thread.sleep(1000); //跳转登陆页面 WebElement login = ((ChromeDriver) webDriver).findElementByClassName("person-text"); login.click(); //登陆信息 Thread.sleep(1000); WebElement phone = webDriver.findElement(By.id("nloginname")); phone.sendKeys("13647610831"); WebElement passw = webDriver.findElement(By.id("npwd")); passw.sendKeys("a96968426"); //滑块验证 s.Robotcheck(); //睡眠10秒,用于人工验证汉字识别 Thread.sleep(10000); //点击登陆 WebElement nsubmit = webDriver.findElement(By.id("nsubmit")); nsubmit.click(); //点击酒店搜索 
Thread.sleep(2000); WebElement btnSearch = webDriver.findElement(By.id("btnSearch")); btnSearch.click(); Thread.sleep(5000); //进入主页 String pageSource = webDriver.getPageSource(); Thread.sleep(1000); WebElement nextPage = webDriver.findElement(By.id("downHerf")); nextPage.click(); //下一页 WebElement downHerfa = s.getNextPage(webDriver, "downHerf"); s.getHotelMassge(webDriver);//获取酒店信息 for (Hotel ph:hotelList ) { System.out.println(ph.toString()); } webDriver.close(); webDriver.quit(); } //获取下一页 public WebElement getNextPage(WebDriver webDriver,String nextPage){ WebElement nextWeb = webDriver.findElement(By.id(nextPage)); nextWeb.click(); return nextWeb; } //填入酒店信息 public void getHotelMassge(WebDriver webDriver){ WebElement hotel_list = webDriver.findElement(By.id("hotel_list")); List<WebElement> hotel_item = hotel_list.findElements(By.className("hotel_item")); System.err.println(hotel_item.size()); Iterator<WebElement> it = hotel_item.iterator(); while (it.hasNext()){ Hotel entry = new Hotel(); WebElement hotel = it.next(); //酒店名称 WebElement hotel_name = hotel.findElement(By.className("hotel_name")); WebElement a = hotel_name.findElement(By.tagName("a")); entry.setName(a.getAttribute("title")); //id String id = hotel_name.getAttribute("data-id"); entry.setId(id); //酒店地址 WebElement hotel_item_htladdress = hotel.findElement(By.className("hotel_item_htladdress")); List<WebElement> a_area = hotel_item_htladdress.findElements(By.tagName("a")); StringBuffer areabuffer = new StringBuffer(); for(int i = 0; i<a_area.size();i++){ areabuffer.append(a_area.get(i).getText()+","); if(i==a_area.size()-1){ areabuffer.append(a_area.get(i).getText()); } } entry.setArea(areabuffer.toString()); entry.setAddress(hotel_item_htladdress.getText()); //客户点评 WebElement hotelitem_judge_box = hotel.findElement(By.className("hotelitem_judge_box")); WebElement judge = hotelitem_judge_box.findElement(By.tagName("a")); entry.setEvaluate(judge.getAttribute("title")); //价钱 WebElement hotel_price_icon = 
hotel.findElement(By.className("hotel_price_icon")); WebElement j_price_lowList = hotel_price_icon.findElement(By.className("J_price_lowList")); entry.setLowprice(Double.parseDouble(j_price_lowList.getText())); hotelList.add(entry); } } //滑块验证 public void Robotcheck(){ //判断鼠标是否定位到指定位置 boolean x_ready = false; boolean y_ready = false; int x_move = 0; int y_move = 0; try { Robot robot = new Robot(); Point mousepoint = MouseInfo.getPointerInfo().getLocation(); System.out.println(mousepoint.x+" "+mousepoint.y); for(int i = 0;i<=MAX_X;i++){ robot.mouseMove(i,mousepoint.y); Thread.sleep(1); if(MAX_X - i == TARGET_X||TARGET_X + i == MAX_X){ x_ready = true; x_move = i; break; } } for(int j = 0; j<=MAX_Y;j++){ robot.mouseMove(x_move,j); Thread.sleep(1); if(MAX_Y - j == TARGET_Y || TARGET_Y + j == MAX_Y){ y_ready = true; y_move = j; break; } } robot.mouseMove(x_move,y_move); robot.mousePress(KeyEvent.BUTTON1_MASK); for(int i = 0; i < 300; i++){ Thread.sleep(5); robot.mouseMove(x_move+i,y_move); } robot.mouseRelease(KeyEvent.BUTTON1_MASK); } catch (AWTException e) { e.printStackTrace(); }catch (InterruptedException e) { e.printStackTrace(); } } }