Skip to content

Commit

Permalink
refactor: 抽取 CatalogParser
Browse files Browse the repository at this point in the history
  • Loading branch information
freeok committed Mar 26, 2024
1 parent b040528 commit d1a48f4
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 30 deletions.
47 changes: 18 additions & 29 deletions src/main/java/com/pcdd/sonovel/core/Crawler.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,16 @@
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.lang.Console;
import cn.hutool.core.util.NumberUtil;
import cn.hutool.core.util.URLUtil;
import cn.hutool.setting.dialect.Props;
import com.pcdd.sonovel.model.Book;
import com.pcdd.sonovel.model.Chapter;
import com.pcdd.sonovel.model.SearchResult;
import com.pcdd.sonovel.parse.BookParser;
import com.pcdd.sonovel.parse.CatalogParser;
import com.pcdd.sonovel.parse.ChapterParser;
import com.pcdd.sonovel.parse.SearchResultParser;
import com.pcdd.sonovel.util.Settings;
import lombok.SneakyThrows;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.io.BufferedOutputStream;
import java.io.File;
Expand Down Expand Up @@ -85,7 +82,7 @@ public static List<SearchResult> search(String keyword) {
}

/**
* 爬取小说 TODO 解耦
* 爬取小说
*
* @param list 搜索到的小说列表
* @param num 下载序号
Expand All @@ -95,48 +92,40 @@ public static List<SearchResult> search(String keyword) {
@SneakyThrows
public static double crawl(List<SearchResult> list, int num, int start, int end) {
SearchResult r = list.get(num);
String bookName = r.getBookName();
String author = r.getAuthor();
// 小说详情页url
String url = r.getUrl();
String bookName = r.getBookName();
String author = r.getAuthor();
Book book = new BookParser(SOURCE_ID).parse(url);

// 小说目录名格式:书名(作者)
bookDir = String.format("%s (%s)", bookName, author);
File dir = FileUtil.mkdir(new File(SAVE_PATH + File.separator + bookDir));
if (!dir.exists()) {
// C:\Program Files 下创建需要管理员权限
Console.log(render("@|red 创建下载目录失败,安装目录需要管理员权限|@"));
Console.log(render("@|red 创建下载目录失败,需要管理员权限|@"));
return 0;
}

Book book = new BookParser(SOURCE_ID).parse(url);
Document document = Jsoup.parse(URLUtil.url(url), 30_000);
// 获取小说目录 TODO 抽取为 CatalogParser
Elements elements = document.getElementById("list").getElementsByTag("a");
// 获取小说目录
List<Chapter> catalog = new CatalogParser(SOURCE_ID).parse(url, start, end);

int autoThreads = Runtime.getRuntime().availableProcessors() * 2;
// 线程池
// 创建线程池
ExecutorService executor = Executors.newFixedThreadPool(THREADS == -1 ? autoThreads : THREADS);
// 阻塞主线程,用于计时
CountDownLatch countDownLatch = new CountDownLatch(end == Integer.MAX_VALUE ? elements.size() : end);
CountDownLatch countDownLatch = new CountDownLatch(end == Integer.MAX_VALUE ? catalog.size() : end);

Console.log("<== 开始下载《{}》({}) 共计 {} 章 | 线程数:{}", bookName, author, elements.size(), autoThreads);
Console.log("<== 开始下载《{}》({}) 共计 {} 章 | 线程数:{}", bookName, author, catalog.size(), autoThreads);
StopWatch stopWatch = new StopWatch();
stopWatch.start();
ChapterParser chapterParser = new ChapterParser(SOURCE_ID);
// 爬取章节并下载
for (int i = start - 1; i < end && i < elements.size(); i++) {
int finalI = i;
executor.execute(() -> {
Chapter build = Chapter.builder()
.chapterNo(finalI + 1)
.title(elements.get(finalI).text())
.url(INDEX_URL + elements.get(finalI).attr("href"))
.build();
Chapter parse = chapterParser.parse(build, countDownLatch);
download(parse, countDownLatch);
countDownLatch.countDown();
});
}
catalog.forEach(item -> executor.execute(() -> {
Chapter chapter = chapterParser.parse(item, countDownLatch);
download(chapter, countDownLatch);
countDownLatch.countDown();
}));
// 等待全部下载完毕
countDownLatch.await();
executor.shutdown();
Expand All @@ -148,7 +137,7 @@ public static double crawl(List<SearchResult> list, int num, int start, int end)
}

/**
* 下载到本地
* 下载章节
*/
private static void download(Chapter chapter, CountDownLatch latch) {
// epub 格式转换前为 html
Expand Down
54 changes: 54 additions & 0 deletions src/main/java/com/pcdd/sonovel/parse/CatalogParser.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package com.pcdd.sonovel.parse;

import cn.hutool.core.util.URLUtil;
import cn.hutool.setting.dialect.Props;
import com.pcdd.sonovel.model.Chapter;
import com.pcdd.sonovel.util.Settings;
import lombok.SneakyThrows;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.List;

/**
 * Parses a book's catalog (table of contents) page into a list of {@link Chapter} entries.
 *
 * <p>The chapter links are located with the XPath expression returned by
 * {@code rule.getBook().getCatalog()}; {@code rule} is inherited from {@link Parser}
 * (presumably resolved from the source id passed to the constructor; confirm in Parser).
 *
 * @author pcdd
 */
public class CatalogParser extends Parser {

    /** Prefix prepended to every chapter link's {@code href}; read from the {@code index_url} setting. */
    public static final String INDEX_URL;

    /** Timeout, in milliseconds, for fetching and parsing the catalog page. */
    private static final int TIMEOUT_MILLIS = 30_000;

    // Load configuration parameters
    static {
        Props sys = Settings.sys();
        INDEX_URL = sys.getStr("index_url");
    }

    public CatalogParser(int sourceId) {
        super(sourceId);
    }

    /**
     * Fetches the catalog page and builds the chapters numbered {@code start} to {@code end}.
     *
     * <p>The requested range is clamped to the chapters actually present on the page:
     * a {@code start} below 1 is treated as 1 and an {@code end} beyond the last chapter
     * is treated as the last chapter. An empty window yields an empty list.
     *
     * @param url   URL of the page containing the chapter links
     * @param start 1-based number of the first chapter to include
     * @param end   1-based number of the last chapter to include (inclusive);
     *              {@code Integer.MAX_VALUE} selects everything up to the last chapter
     * @return the selected chapters in page order, each carrying its title, URL and
     *         1-based chapter number; never {@code null}
     */
    @SneakyThrows
    public List<Chapter> parse(String url, int start, int end) {
        Document document = Jsoup.parse(URLUtil.url(url), TIMEOUT_MILLIS);
        Elements links = document.selectXpath(this.rule.getBook().getCatalog());

        // Clamp the window so that start <= 0 can no longer produce a negative index.
        int from = Math.max(start - 1, 0);
        int to = Math.min(end, links.size());

        List<Chapter> catalog = new ArrayList<>(Math.max(to - from, 0));
        for (int i = from; i < to; i++) {
            Chapter chapter = Chapter.builder()
                    .title(links.get(i).text())
                    .url(INDEX_URL + links.get(i).attr("href"))
                    // Chapter numbers are 1-based positions in the full catalog, not in the window.
                    .chapterNo(i + 1)
                    .build();
            catalog.add(chapter);
        }

        return catalog;
    }

    /** Manual smoke run: parses the first 50 catalog entries of a sample book. */
    public static void main(String[] args) {
        new CatalogParser(1).parse("https://www.xbiqugu.info/66/66747/", 1, 50);
    }

}
2 changes: 1 addition & 1 deletion src/main/resources/rule/rule1.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"latestChapter": "",
"latestUpdate": "",
"isEnd": "",
"catalog": "//*[@id=\"list\"]"
"catalog": "//*[@id=\"list\"]/dl/dd/a"
},
"chapter": {
"url": "https://www.xbiqugu.info/0/%s/%s.html",
Expand Down

0 comments on commit d1a48f4

Please sign in to comment.