From d940ec6cc8d549d157a6a800f4c9d20be48482b0 Mon Sep 17 00:00:00 2001 From: yanlongqi Date: Mon, 14 Oct 2024 01:29:48 +0800 Subject: [PATCH] =?UTF-8?q?=E6=8A=93=E5=8F=96=E8=A7=86=E9=A2=91=E6=92=AD?= =?UTF-8?q?=E6=94=BE=E6=BA=90=E5=9C=B0=E5=9D=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pom.xml | 25 ++++- .../video/models/entity/ClassifyInfo.java | 15 +++ .../video/models/entity/MadouVideoInfo.java | 2 + .../models/mapper/ClassifyInfoMapper.java | 7 ++ .../models/service/ClassifyInfoService.java | 10 ++ .../models/service/MadouVideoService.java | 102 +++++++++++++++++- .../yuchat/crawler/video/utils/HttpUtils.java | 29 +++++ src/main/resources/application.yml | 6 ++ 8 files changed, 193 insertions(+), 3 deletions(-) create mode 100644 src/main/java/top/yuchat/crawler/video/models/entity/ClassifyInfo.java create mode 100644 src/main/java/top/yuchat/crawler/video/models/mapper/ClassifyInfoMapper.java create mode 100644 src/main/java/top/yuchat/crawler/video/models/service/ClassifyInfoService.java create mode 100644 src/main/java/top/yuchat/crawler/video/utils/HttpUtils.java create mode 100644 src/main/resources/application.yml diff --git a/pom.xml b/pom.xml index 6f0ca8c..f306ad3 100644 --- a/pom.xml +++ b/pom.xml @@ -30,7 +30,7 @@ org.springframework.boot - spring-boot-starter-web + spring-boot-starter @@ -39,6 +39,29 @@ 3.5.7 + + org.postgresql + postgresql + + + + org.apache.httpcomponents + httpclient + 4.5.14 + + + + org.jsoup + jsoup + 1.15.3 + + + + com.alibaba.fastjson2 + fastjson2 + 2.0.51 + + diff --git a/src/main/java/top/yuchat/crawler/video/models/entity/ClassifyInfo.java b/src/main/java/top/yuchat/crawler/video/models/entity/ClassifyInfo.java new file mode 100644 index 0000000..d277620 --- /dev/null +++ b/src/main/java/top/yuchat/crawler/video/models/entity/ClassifyInfo.java @@ -0,0 +1,15 @@ +package top.yuchat.crawler.video.models.entity; + +import lombok.Data; + +@Data +public class ClassifyInfo { + + private String id; + + private String name; + + private String url; + + private Integer pageSize; +} diff --git a/src/main/java/top/yuchat/crawler/video/models/entity/MadouVideoInfo.java b/src/main/java/top/yuchat/crawler/video/models/entity/MadouVideoInfo.java index cdae7f8..cf70cec 100644 --- a/src/main/java/top/yuchat/crawler/video/models/entity/MadouVideoInfo.java +++ b/src/main/java/top/yuchat/crawler/video/models/entity/MadouVideoInfo.java @@ -18,4 +18,6 @@ public class MadouVideoInfo { private String m3u8Url; + private Boolean m3u8; + } diff --git a/src/main/java/top/yuchat/crawler/video/models/mapper/ClassifyInfoMapper.java b/src/main/java/top/yuchat/crawler/video/models/mapper/ClassifyInfoMapper.java new file mode 100644 index 0000000..ddd4047 --- /dev/null +++ b/src/main/java/top/yuchat/crawler/video/models/mapper/ClassifyInfoMapper.java @@ -0,0 +1,7 @@ +package top.yuchat.crawler.video.models.mapper; + +import com.baomidou.mybatisplus.core.mapper.BaseMapper; +import top.yuchat.crawler.video.models.entity.ClassifyInfo; + +public interface ClassifyInfoMapper extends BaseMapper { +} diff --git a/src/main/java/top/yuchat/crawler/video/models/service/ClassifyInfoService.java b/src/main/java/top/yuchat/crawler/video/models/service/ClassifyInfoService.java new file mode 100644 index 0000000..8aa286f --- /dev/null +++ b/src/main/java/top/yuchat/crawler/video/models/service/ClassifyInfoService.java @@ -0,0 +1,10 @@ +package top.yuchat.crawler.video.models.service; + +import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl; +import org.springframework.stereotype.Service; +import top.yuchat.crawler.video.models.entity.ClassifyInfo; +import top.yuchat.crawler.video.models.mapper.ClassifyInfoMapper; + +@Service +public class ClassifyInfoService extends ServiceImpl { +} diff --git a/src/main/java/top/yuchat/crawler/video/models/service/MadouVideoService.java b/src/main/java/top/yuchat/crawler/video/models/service/MadouVideoService.java index 1197913..f470779 100644 --- a/src/main/java/top/yuchat/crawler/video/models/service/MadouVideoService.java +++ b/src/main/java/top/yuchat/crawler/video/models/service/MadouVideoService.java @@ -1,11 +1,28 @@ package top.yuchat.crawler.video.models.service; +import com.alibaba.fastjson2.JSON; +import com.alibaba.fastjson2.JSONObject; +import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper; import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl; +import jakarta.annotation.PostConstruct; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; import org.springframework.stereotype.Service; +import top.yuchat.crawler.video.utils.HttpUtils; +import top.yuchat.crawler.video.models.entity.ClassifyInfo; import top.yuchat.crawler.video.models.entity.MadouVideoInfo; import top.yuchat.crawler.video.models.mapper.MadouVideoMapper; +import java.io.IOException; +import java.util.List; + +@Slf4j @Service +@RequiredArgsConstructor public class MadouVideoService extends ServiceImpl { /** @@ -13,9 +30,90 @@ public class MadouVideoService extends ServiceImpl * /index.php/vod/type/id/1/page/2.html - * + *

* https://yutujx.com/?url=https://t20a.cdn2020.com/video/m3u8/2022/10/23/cc234c9c/index.m3u8 */ + + public static final String BASE_URL = "https://madou.io"; + public static final String VIDEO_PACKAGE = BASE_URL + "/index.php/vod/type/id/{id}/page/{page_size}.html"; + + private final ClassifyInfoService classifyInfoService; + + + public void getVideoList() { + List list = classifyInfoService.list(); + int count = 0; + + for (ClassifyInfo classifyInfo : list) { + log.info("开始处理分类:{}", classifyInfo.getName()); + for (int i = 1; i <= classifyInfo.getPageSize(); i++) { + String url = VIDEO_PACKAGE.replace("{id}", classifyInfo.getId()).replace("{page_size}", String.valueOf(i)); + log.info("正在处理第{}页,页面地址:{}", i, url); + try { + saveVideo(url, classifyInfo.getName()); + count++; + } catch (Exception e) { + log.error("发送请求失败,跳过该页数据的爬取,URL:{}, 错误信息:{}", url, e.getMessage()); + } + } + + } + log.info("数据处理完成,共处理 {} 页数据", count); + } + + + public void saveVideo(String url, String classify) throws IOException { + String html = HttpUtils.get(url); + Document document = Jsoup.parse(html); + + Elements imgPs = document.getElementsByClass("img"); + for (Element imgP : imgPs) { + Element img = imgP.child(0); + + String title = img.attr("title"); + String coverUrl = img.attr("src"); + Element a = imgP.child(1); + String m3u8Url = a.attr("href"); + log.info("分类:{},标题:{},封面:{},播放地址:{}", classify, title, coverUrl, m3u8Url); + + MadouVideoInfo madouVideoInfo = new MadouVideoInfo(); + madouVideoInfo.setClassify(classify); + madouVideoInfo.setTitle(title); + madouVideoInfo.setCoverUrl(coverUrl); + madouVideoInfo.setM3u8Url(m3u8Url); + save(madouVideoInfo); + } + } + + @PostConstruct + public void processingData() { + QueryWrapper wrapper = new QueryWrapper<>(); + wrapper.eq("m3u8", false); + List list = list(wrapper); + log.info("开始处理数据,数据总量: {}", list.size()); + + for (MadouVideoInfo madouVideoInfo : list) { + String url = BASE_URL + madouVideoInfo.getM3u8Url(); + try { + String html = HttpUtils.get(url); + + Document document = Jsoup.parse(html); + Element bofangBox = document.getElementById("bofang_box"); + Element script = bofangBox.child(0); + String scriptStr = script.html(); + JSONObject videoInfo = JSONObject.parseObject(scriptStr.substring(scriptStr.indexOf("=") + 1)); + String m3u8Url = videoInfo.getString("url"); + log.info("标题: {}, 播放地址:{}", madouVideoInfo.getTitle(), m3u8Url); + madouVideoInfo.setM3u8Url(m3u8Url); + madouVideoInfo.setM3u8(true); + updateById(madouVideoInfo); + } catch (Exception e) { + log.error("处理失败,失败信息:{}", e.getMessage()); + } + } + + + } } diff --git a/src/main/java/top/yuchat/crawler/video/utils/HttpUtils.java b/src/main/java/top/yuchat/crawler/video/utils/HttpUtils.java new file mode 100644 index 0000000..7ea0fdd --- /dev/null +++ b/src/main/java/top/yuchat/crawler/video/utils/HttpUtils.java @@ -0,0 +1,29 @@ +package top.yuchat.crawler.video.utils; + +import lombok.extern.slf4j.Slf4j; +import org.apache.http.HttpEntity; +import org.apache.http.HttpStatus; +import org.apache.http.client.HttpClient; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.DefaultHttpClient; +import org.apache.http.util.EntityUtils; + +import java.io.IOException; + +@Slf4j +public class HttpUtils { + private static final HttpClient HTTP_CLIENT = new DefaultHttpClient(); + + public static String get(String url) throws IOException { + HttpGet httpGet = new HttpGet(url); + CloseableHttpResponse response = (CloseableHttpResponse) HTTP_CLIENT.execute(httpGet); + int statusCode = response.getStatusLine().getStatusCode(); + if (HttpStatus.SC_OK != statusCode) { + throw new RuntimeException("请求失败,状态码:" + statusCode + ",请求地址:" + url); + } + HttpEntity entity = response.getEntity(); + return EntityUtils.toString(entity); + } + +} diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml new file mode 100644 index 0000000..c4d96da --- /dev/null +++ b/src/main/resources/application.yml @@ -0,0 +1,6 @@ +spring: + datasource: + driver-class-name: org.postgresql.Driver + url: jdbc:postgresql://pgsql.yuchat.top:5432/postgres + username: postgres + password: longqi@1314 \ No newline at end of file