视频教程1: 狂神说Java Jsoup爬虫入门实战 https://www.bilibili.com/video/BV1La4y1x7Wm?vd_source=aee5e475191b69e6c781059ab6662584
视频教程2:https://www.bilibili.com/video/BV1RU4y147eZ?vd_source=aee5e475191b69e6c781059ab6662584
具体内容请观看上面的视频,可以快速入门。
入门实战教程
1.引入依赖
1 2 3 4 5 6
| <dependency>
    <!-- jsoup library (the org.jsoup packages imported by the Java examples) -->
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.14.3</version>
</dependency>
|
2.编写测试代码
下面我们以爬取京东上商品的图片、名称和价格为例
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
| package com.jason;
import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements;
import java.io.IOException; import java.net.URL;
/**
 * Small jsoup demo: runs a keyword search on JD.com and prints the image address,
 * title and price of every {@code <li>} found inside the {@code J_goodsList} element.
 */
public class HtmlParseUtil {

    /** Prefix of the search URL; the percent-encoded keyword is appended to it. */
    private static final String SEARCH_URL = "https://search.jd.com/Search?keyword=";

    /** Fetch/read timeout handed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 30000;

    public static void main(String[] args) throws IOException {
        HtmlParseUtil.parseJd("联想电脑");
    }

    /**
     * Downloads the search result page for {@code keywords} and prints one
     * image/title/price triple per list item.
     *
     * @param keywords search text; must not be {@code null}
     * @throws IOException if the page cannot be fetched, or if the fetched page does not
     *                     contain the {@code J_goodsList} element
     */
    public static void parseJd(String keywords) throws IOException {
        java.util.Objects.requireNonNull(keywords, "keywords");

        // Percent-encode the keyword: raw non-ASCII text, spaces or '&' would otherwise
        // produce a malformed or truncated query string.
        String url = SEARCH_URL + java.net.URLEncoder.encode(keywords, "UTF-8");

        Document document = Jsoup.parse(new URL(url), TIMEOUT_MILLIS);

        // getElementById returns null when the id is absent; fail with a clear message
        // instead of a bare NullPointerException.
        Element goodsList = document.getElementById("J_goodsList");
        if (goodsList == null) {
            throw new IOException("Element 'J_goodsList' not found in response from " + url);
        }

        for (Element el : goodsList.getElementsByTag("li")) {
            // The image address is read from the data-lazy-img attribute of the first <img>.
            String attr = el.getElementsByTag("img").eq(0).attr("data-lazy-img");
            String title = el.getElementsByClass("p-name").eq(0).text();
            String price = el.getElementsByClass("p-price").eq(0).text();
            System.out.println("图片地址:https:" + attr);
            System.out.println("商品名称:" + title);
            System.out.println("商品价格:" + price);
        }
    }
}
|
3.运行测试
示例
爬取房天下上面的房价信息
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
| package com.jason;
import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements;
import java.io.IOException; import java.net.URL;
/**
 * Small jsoup demo: walks the paginated new-house listing on fang.com and prints the
 * image address, name and price of every listing.
 */
public class ParseHouse {

    /** Fetch/read timeout handed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 30000;

    /**
     * Crawls listing pages one after another, starting at page 91, and stops at the first
     * page that yields no listings (the original loop had no exit condition and could
     * only end by throwing).
     */
    public static void main(String[] args) throws IOException {
        for (int i = 91; ; i++) {
            String url = "https://newhouse.fang.com/house/s/b" + i;
            System.out.println("正在爬取第" + i + "页的数据.....");
            if (printHouses(url) == 0) {
                // Nothing to print on this page: treat it as the end of the listing.
                break;
            }
        }
    }

    /**
     * Downloads {@code url} and prints image, name and price for each {@code <li>} inside
     * the {@code newhouse_loupan_list} element. Prints nothing when that element is absent.
     *
     * @param url address of one listing page
     * @throws IOException if the page cannot be fetched
     */
    public static void selectHouses(String url) throws IOException {
        printHouses(url);
    }

    /** Prints every listing found at {@code url} and returns how many were printed. */
    private static int printHouses(String url) throws IOException {
        Document document = Jsoup.parse(new URL(url), TIMEOUT_MILLIS);

        // getElementById returns null when the id is absent; report "no listings"
        // rather than failing with a NullPointerException.
        Element listing = document.getElementById("newhouse_loupan_list");
        if (listing == null) {
            return 0;
        }

        Elements items = listing.getElementsByTag("li");
        for (Element el : items) {
            // The second <img> (index 1) of each item supplies both the picture and the name.
            Elements picture = el.getElementsByTag("img").eq(1);
            String images = picture.attr("src");
            String alt = picture.attr("alt");
            String price = el.getElementsByClass("nhouse_price").text();
            System.out.println("图片:https:" + images);
            System.out.println("名称:" + alt);
            System.out.println("房价:" + price);
        }
        return items.size();
    }
}
|
爬取豆瓣电影排行榜的信息
爬取全国各地房价的信息到excel表格中
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
| package com.jason;
import com.alibaba.excel.EasyExcel; import com.jason.entity.HorseInfo; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements;
import java.io.File; import java.io.IOException; import java.net.URL; import java.util.ArrayList; import java.util.List;
/**
 * jsoup + EasyExcel demo: reads the paginated house-price ranking on fangjia.gotohui.com
 * and writes the rows of every page into a single Excel workbook.
 */
public class HousePriseList {

    /** Fetch/read timeout handed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 30000;

    /**
     * Fetches ranking pages 1, 2, 3, ... and adds each page's rows to the workbook.
     * The loop ends when a page yields no rows, when the thread is interrupted, or when
     * any step fails (the failure is printed).
     */
    public static void main(String[] args) throws IOException {
        String fileName = "C:\\AlYun\\work\\housePriceList.xlsx";
        String temp = "C:\\AlYun\\work\\temp.xlsx";
        File file = new File(fileName);
        File tempFile = new File(temp);

        int i = 1;
        while (true) {
            try {
                String url = "https://fangjia.gotohui.com/top/" + i + ".html";
                List<HorseInfo> horseInfos = HousePriseList.selectHorse(url);
                if (horseInfos.isEmpty()) {
                    // A page without data rows would otherwise keep the loop running forever.
                    break;
                }

                if (file.exists()) {
                    // Write to a temporary file, using the existing workbook as the template
                    // and suppressing the header row.
                    EasyExcel.write(file, HorseInfo.class).needHead(false)
                            .withTemplate(file).file(tempFile).sheet().doWrite(horseInfos);
                } else {
                    EasyExcel.write(file, HorseInfo.class).sheet().doWrite(horseInfos);
                }

                if (tempFile.exists()) {
                    // Replace the workbook with the temporary file. Files.move throws on
                    // failure, unlike File.delete()/renameTo(), whose boolean results the
                    // original code ignored.
                    java.nio.file.Files.move(tempFile.toPath(), file.toPath(),
                            java.nio.file.StandardCopyOption.REPLACE_EXISTING);
                }

                i++;
                Thread.sleep(2000); // pause between page requests
            } catch (InterruptedException e) {
                // Restore the interrupt flag so the caller can observe the interruption.
                Thread.currentThread().interrupt();
                break;
            } catch (Exception e) {
                e.printStackTrace();
                break;
            }
        }
    }

    /**
     * Downloads {@code url}, prints every table row it finds, and collects the rows whose
     * first cell is non-empty.
     *
     * @param url address of one ranking page
     * @return the collected rows, in page order; empty when no data row was found
     * @throws IOException if the page cannot be fetched
     */
    public static List<HorseInfo> selectHorse(String url) throws IOException {
        Document document = Jsoup.parse(new URL(url), TIMEOUT_MILLIS);
        // NOTE(review): getElementsByClass is documented for a single class name; this
        // multi-class string presumably matches only when the class attribute is exactly
        // this text. Confirm against the live page.
        Elements elements = document.getElementsByClass("ntable table-striped table-hover");
        List<HorseInfo> horseInfos = new ArrayList<>();

        for (Element element : elements) {
            for (Element el : element.getElementsByTag("tr")) {
                Elements td = el.getElementsByTag("td");
                // Extract each cell once and reuse it for both printing and storing.
                String number = td.eq(0).text();
                String city = td.eq(1).text();
                String unitPrice = td.eq(2).text();
                String yearOnYear = td.eq(3).text();
                String monthOnMonth = td.eq(4).text();
                String incomeRatio = td.eq(5).text();

                System.out.println("-----------------------------------------");
                System.out.println("编号:" + number);
                System.out.println("城市:" + city);
                System.out.println("单价(元/㎡):" + unitPrice);
                System.out.println("同比(去年):" + yearOnYear);
                System.out.println("环比(上月):" + monthOnMonth);
                System.out.println("收入比:" + incomeRatio);

                // Rows with an empty first cell are printed but not stored.
                if (!number.isEmpty()) {
                    horseInfos.add(new HorseInfo(
                            number, city, unitPrice, yearOnYear, monthOnMonth, incomeRatio));
                }
            }
        }
        return horseInfos;
    }
}
|