-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathdb_blog.sql
145 lines (104 loc) · 50.2 KB
/
db_blog.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
/*
SQLyog Ultimate v11.33 (64 bit)
MySQL - 5.1.49-community : Database - db_blog
*********************************************************************
*/
/* Session setup. Each setting that is changed is first saved in an @OLD_*
   user variable so the statements at the end of the script can restore it. */
/*!40101 SET NAMES utf8 */;
/* SQL_MODE is intentionally not reset to '' here: an earlier reset made
   @OLD_SQL_MODE capture '' instead of the caller's mode, so the final restore
   left the session with an empty SQL_MODE. The SET below installs the mode
   used for the load, so the early reset was redundant as well. */
/* Unique and foreign-key checks are disabled for the load; t_blog and
   t_blogtag reference t_blogtype, which is created later in this script. */
/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
/* NO_AUTO_VALUE_ON_ZERO: an explicit 0 in an AUTO_INCREMENT column is stored
   as 0 rather than replaced by the next sequence value. */
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
/* Suppress note-level warnings while the script runs. */
/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
/* Create the target schema when it does not exist yet, then make it the
   default database. Without the USE, the CREATE TABLE statements below fail
   with "No database selected" on a fresh connection, or create the tables in
   whatever schema the session happens to have selected. */
CREATE DATABASE /*!32312 IF NOT EXISTS*/`db_blog` /*!40100 DEFAULT CHARACTER SET utf8 */;
USE `db_blog`;
/* Table structure for table `t_blog`: one row per blog post. */
CREATE TABLE `t_blog` (
    `id`             INT(11)     NOT NULL AUTO_INCREMENT,
    `title`          VARCHAR(50) DEFAULT NULL,
    /* The seed rows store the post body as HTML markup. */
    `content`        TEXT,
    `author`         VARCHAR(20) DEFAULT NULL,
    `readNum`        INT(11)     DEFAULT NULL,
    `publishTime`    DATETIME    DEFAULT NULL,
    `blogTypeId`     INT(11)     DEFAULT NULL,
    /* Indexed but not covered by a foreign key; some seed rows carry -1 here. */
    `blogTagId`      INT(11)     DEFAULT NULL,
    /* Seed rows only use 0 and 1 - presumably a boolean flag; confirm with the application. */
    `isRecommend`    INT(5)      DEFAULT NULL,
    `recommendOrder` INT(11)     DEFAULT NULL,
    `coverImageName` VARCHAR(50) DEFAULT NULL,
    PRIMARY KEY (`id`),
    KEY `blogTypeId` (`blogTypeId`),
    KEY `blogTagId` (`blogTagId`),
    /* `t_blogtype` is created further down; this works because the script
       header switches FOREIGN_KEY_CHECKS off. */
    CONSTRAINT `t_blog_ibfk_1` FOREIGN KEY (`blogTypeId`) REFERENCES `t_blogtype` (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=12 DEFAULT CHARSET=utf8;
/*Data for the table `t_blog` */
LOCK TABLES `t_blog` WRITE;
insert into `t_blog`(`id`,`title`,`content`,`author`,`readNum`,`publishTime`,`blogTypeId`,`blogTagId`,`isRecommend`,`recommendOrder`,`coverImageName`) values (1,'我的个人简介','<p><img src=\"/Blog/Blog/images/20160929/1475112326703050226.jpg\" title=\"1475112326703050226.jpg\" alt=\"未标题-1.jpg\" height=\"269\" width=\"401\"/></p><p>以上就是博主的照片啦</p><p>博主真实名字是:刘东宝,英文名:D.B,21岁了,广东揭阳揭西人</p><p>现在是华南师范大学职业教育学院的一名大三学生</p><p>爱好这方面嘛,当然是编程啦!</p><p>现阶段的技能树为:</p><p>前端 : Html5,CSS3,JavaScript,Ajax,JQuery,BootStrap,EasyUI,Android<br/></p><p>后端 : Jsp Servlet,Struts2,Spring,Hibernate,SpringMVC,MyBatis</p><p>数据库 : MySql,Sql Server</p><p>学无止境.....技能树还在不断壮大中!<img src=\"http://img.baidu.com/hi/jx2/j_0003.gif\" title=\"\" alt=\"\" height=\"50\" width=\"50\"/><img src=\"http://img.baidu.com/hi/jx2/j_0003.gif\"/><img src=\"http://img.baidu.com/hi/jx2/j_0003.gif\"/></p><p>梦想这方面嘛,梦想是一定要有的,万一实现了呢</p><p>现在的梦想是:毕业能进入BAT等知名互联网公司,现在看来还遥不可及哈!!</p><p><br/></p><p>以下是我的微信,交个朋友吧~</p><p><img src=\"/Blog/Blog/images/20160929/1475131982175078981.jpg\" title=\"1475131982175078981.jpg\" alt=\"1475131982175078981.jpg\" height=\"150\" width=\"150\"/><br/></p>','D.B | 氧化钡',130,'2016-09-29 00:29:10',5,6,1,3,'20160929150340.jpg'),(2,'本博客项目1.0版本分享','<p>什么?本项目开源??。。。对!!! 
知识大家一起学习,共同进步嘛。现在这个博客是我花了一周多时间敲出来的,我主打的是后端开发,所以后端的所有都是我自己一行一行,踩了无数Bug敲出来的。另外,前端这么炫酷的界面当然不是我做的啦,我找的是网上的静态模板,直接套用的模板然后再删删改改。当然,模板也会分享出来的。</p><p>所涉及到的环境及技术知识如下,</p><p>IDE : eclipse</p><p>JDK : jdk1.8.0_51</p><p>Tomcat : apache-tomcat-7.0.63</p><p>前端 : HTML5,CSS3,JavaScript,Ajax,JQuery,EasyUI</p><p>后端 : Java语言,Spring4,SpringMVC,MyBatis</p><p>富文本编译器: UEditor(支持单图,多图,截图,涂鸦,视频,音乐,文件上传功能,强!)</p><p>数据库 : MySQL</p><p><br/></p><p>之后会继续分享该项目的升级版本并写上详细注释,以及我之前,以后敲的大大小小项目都会分享出来,大家敬请期待!</p><p><br/></p><p><br/></p><p>最后,</p><p>如果你觉得这项目对你的学习产生帮助,可以对博主小小的捐赠一把,你的支持,我的动力。微信收款:<br/><img src=\"/Blog/Blog/images/20160929/1475134599872083743.jpg\" title=\"1475134599872083743.jpg\" alt=\"1475134599872083743.jpg\" height=\"200\" width=\"200\"/></p><p><br/></p><p style=\"line-height: 16px;\"><img src=\"http://121.42.138.57/Blog/ueditor/dialogs/attachment/fileTypeImages/icon_rar.gif\"/><a style=\"font-size:12px; color:#0066cc;\" href=\"/Blog/Blog/files/20160929/1475138628228017290.zip\" title=\"Blog.zip\">Blog.zip</a></p><p style=\"line-height: 16px;\"><img src=\"http://121.42.138.57/Blog/ueditor/dialogs/attachment/fileTypeImages/icon_rar.gif\"/><a style=\"font-size:12px; color:#0066cc;\" href=\"/Blog/Blog/files/20160929/1475133614696024964.rar\" title=\"黑色质感时间轴套用帝国.rar\">黑色质感时间轴套用帝国.rar</a></p><p><br/></p>','D.B | 氧化钡',115,'2016-09-29 00:29:56',3,-1,1,1,'20160929150747.jpg'),(3,'关于服务器的选择及注意事项','<p><br/>这次服务器采用的是阿里的服务器,学生特价9.87元一个月,活动详情请下载阿里云服务器,域名是在万网买的www.dblearn.top,db向更高处学习,嘻嘻。由于购买域名需要在政府备案,需要大概30天左右,所以,大约10月份中旬大家就可以通过www.dblearn.top访问我的博客了。<br/></p><p><br/></p><p>下面说下注意事项,即我在部署时遇到的坑。项目在本地开发好后,在服务器需要配置相同的环境,然后把项目上传到服务器。注意一定要相同的环境! 
这次我本地是JDK1.8,然而服务器装的是JDK1.7,同时数据库也少插入了一张表,所以折腾了好久都没部署成功。大家注意注意</p><p><br/></p><p><br/></p><p>阿里云网址:https://www.aliyun.com/<br/></p><p><br/></p><p><img src=\"/Blog/Blog/images/20160929/1475135565928083702.jpg\" title=\"1475135565928083702.jpg\" alt=\"mmexport1475135495315.jpg\"/></p>','D.B | 氧化钡',53,'2016-09-29 00:31:01',4,-1,1,2,'20160929155453.jpg'),(4,'转:用Java实现网易云音乐爬虫','<p>注:干货类博文均为转载。原文链接 : <a href=\"https://zhuanlan.zhihu.com/p/22698051\">https://zhuanlan.zhihu.com/p/22698051</a><br/></p><p><br/></p><p><br/></p><p><br/></p><h2>起因</h2><p>前两天在知乎上看到一个帖子《<a href=\"https://www.zhihu.com/question/31677442\" class=\"internal\">网易云音乐有哪些评论过万的歌曲?</a>》,一时技痒,用Java实现了一个简单的爬虫,这里简单记录一下。</p><p>最终的结果开放出来了,大家可以随意访问,请戳这里>>>>>> <a href=\"https://link.zhihu.com/?target=http%3A//grri94kmi4.app.tianmaying.com/songs\" class=\" wrap external\" target=\"_blank\">网易云音乐爬虫结果</a>。</p><p>完整的源代码戳右上角“<a href=\"https://link.zhihu.com/?target=https%3A//www.tianmaying.com/tutorial/music-163-crawler/repo\" class=\" wrap external\" target=\"_blank\">参考代码</a>”链接。</p><h2>爬虫简介</h2><p>网络爬虫是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本,一个通用的网络爬虫大致包含以下几个步骤:</p><p><img src=\"/Blog/Blog/images/20160930/1475199880653077325.png\" class=\"origin_image zh-lightbox-thumb\" width=\"793\"/></p><p>网络爬虫的大致流程如上图所示,无论你是做什么样的爬虫应用,整体流程都是大同小异。现在,我们就根据网易云音乐来定制一个专门爬取音乐评论数量的特定网络爬虫。</p><h2>前期准备</h2><h3>网页类型分析</h3><p>首先,我们需要对<a href=\"https://link.zhihu.com/?target=http%3A//music.163.com/\" class=\" wrap external\" target=\"_blank\">网易云音乐</a>整个网站有个大致的了解,进入<a href=\"https://link.zhihu.com/?target=http%3A//music.163.com/\" class=\" wrap external\" target=\"_blank\">网易云音乐首页</a>,浏览后发现其大概有这么几种类型的URL:</p><ul class=\" 
list-paddingleft-2\"><li><p>推荐页面</p></li><li><p>排行榜列表以及排行榜页面</p></li><li><p>歌单列表以及歌单页面</p></li><li><p>主播电台列表以及主播电台页面</p></li><li><p>歌手列表以及歌手页面</p></li><li><p>专辑列表(新碟上架)以及专辑页面</p></li><li><p>歌曲页面</p></li></ul><p>我们最终需要爬取的数据在歌曲页面中,该页面里包含了歌曲的名称以及歌曲的评论数量。</p><p>另外,我们还需要尽可能多的获取歌曲页面,这些信息我们可以从前面6种类型的页面拿到。其中,歌单列表以及歌单页面结构最简单,歌单列表直接分页就可以拿到。因此,我们选择歌单页面作为我们的初始页面,然后歌单列表--歌单--歌曲一路爬下去即可。</p><h3>设计数据模型</h3><p>通过上述分析,我们可以知道我们要做两件事情,一是爬取页面歌单列表--歌单--歌曲,二是将最终的结果存储起来。因此,我们只需要两个对象,一个用来存储页面相关的信息,url、页面类型、是否被爬过(html和title作为临时数据存储),另外一个用来存储歌曲相关信息,url、歌曲名,评论数。因此,model类如下:</p><pre class=\"brush:java;toolbar:false;\">public class WebPage {\n public enum PageType {\n song, playlist, playlists;\n }\n\n public enum Status {\n crawled, uncrawl;\n }\n\n private String url;\n private String title;\n private PageType type;\n private Status status;\n private String html;\n ...}</pre><pre class=\"brush:java;toolbar:false;\">public class Song {\n private String url;\n private String title;\n private Long commentCount;\n ...}</pre><h2>获取网页内容并解析</h2><p>根据之前的分析,我们需要爬的页面有三种:歌单列表、歌单以及歌曲。为了验证想法的可行性,我们先用代码来解析这三种类型的网页,我们将网页内容获取以及解析的代码都放入CrawlerThread当中。</p><h3>获取html</h3><p>无论想要从什么网站中拿到数据,获取其html代码都是最最基础的一步,这里我们使用jsoup来获取页面信息,在CrawlerThread中添加如下代码:</p><pre class=\"brush:java;toolbar:false;\">private boolean fetchHtml(WebPage webPage) throws IOException {\n Connection.Response response = Jsoup.connect(webPage.getUrl()).timeout(3000).execute();\n webPage.setHtml(response.body());\n return response.statusCode() / 100 == 2 ? 
true : false;}public static void main(String[] args) throws Exception {\n WebPage playlists = new WebPage("?order=hot&cat=%E5%85%A8%E9%83%A8&limit=35&offset=0",PageType.playlists);\n CrawlerThread crawlerThread = new CrawlerThread();\n crawlerThread.fetchHtml(playlists);\n System.out.println(playlists.getHtml());}</pre><p>运行后即可看到html文本的输出</p><h3>解析歌单列表页面</h3><p>得到html后,我们来解析歌单列表,拿到页面中的所有歌单,Jsoup包含了html解析相关的功能,我们无需添加其他依赖,直接在CrawlerThread中添加如下代码:</p><pre class=\"brush:java;toolbar:false;\">private List<WebPage> parsePlaylist(WebPage webPage) {\n Elements songs = Jsoup.parse(webPage.getHtml()).select("ul.f-hide li a");\n return songs.stream().map(e -> new WebPage(BASE_URL + e.attr("href"), PageType.song, e.html())).collect(Collectors.toList());\n}\n\npublic static void main(String[] args) throws Exception {\n WebPage playlists = new WebPage("http://music.163.com/discover/playlist/?order=hot&cat=%E5%85%A8%E9%83%A8&limit=35&offset=0", PageType.playlists);\n CrawlerThread crawlerThread = new CrawlerThread();\n crawlerThread.fetchHtml(playlists);\n System.out.println(crawlerThread.parsePlaylists(playlists));\n}</pre><h3>解析歌单页面</h3><p>和歌单列表页面类似,只需要将歌曲相关的元素找出来即可:</p><pre class=\"brush:java;toolbar:false;\">private List<WebPage> parsePlaylist(WebPage webPage) {\n Elements songs = Jsoup.parse(webPage.getHtml()).select("ul.f-hide li a");\n return songs.stream().map(e -> new WebPage(BASE_URL + e.attr("href"), PageType.song, e.html())).collect(Collectors.toList());\n}\n\npublic static void main(String[] args) throws Exception {\n WebPage playlist = new WebPage("http://music.163.com/playlist?id=454016843", PageType.playlist);\n CrawlerThread crawlerThread = new CrawlerThread();\n crawlerThread.fetchHtml(playlist);\n System.out.println(crawlerThread.parsePlaylist(playlist));\n}</pre><p>注意,这里为了方便,我们将歌曲的名称也拿到了,这样后面我们就不需要再次获取歌曲名称了。</p><h3>解析歌曲页面</h3><p>终于到歌曲页面了,这里网易云音乐做了反爬处理,获取数据时的参数需要经过加密处理,这里我们不纠结于具体算法,如果有兴趣的直接看参考代码,我们只看关键代码:</p><pre class=\"brush:java;toolbar:false;\">private Song 
parseSong(WebPage webPage) throws Exception {\n return new Song(webPage.getUrl(), webPage.getTitle(), getCommentCount(webPage.getUrl().split("=")[1]));}public static void main(String[] args) throws Exception {\n WebPage song = new WebPage("http://music.163.com/song?id=29999506", PageType.song, "test");\n CrawlerThread crawlerThread = new CrawlerThread();\n crawlerThread.fetchHtml(song);\n System.out.println(crawlerThread.parseSong(song));}</pre><p>好吧,获取过程确实比较曲折,经过了多次的加密,不过不管怎么样,最终我们还是拿到了我们想要的数据。接下来,就是使用爬虫将整套机制run起来了。</p><h2>实现爬虫</h2><p>重新回顾一下流程图,我们发现其中有很重要的一个对象是爬虫队列,爬虫队列的实现方法有很多种,自己实现,mysql、redis、MongoDB等等都可以满足我们的需求,不同的选择会导致我们实现的不一致。</p><p><img src=\"/Blog/Blog/images/20160930/1475199880653077325.png\" class=\"origin_image zh-lightbox-thumb\" width=\"793\"/></p><p>综合考虑,我们使用Mysql+ Spring Data JPA + Spring MVC来跑我们的整套框架,最终还可以将爬下来的数据通过web服务展现出来。更深入地学习Spring MVC,请大家参考<a href=\"https://link.zhihu.com/?target=https%3A//course.tianmaying.com/spring-mvc\" class=\" wrap external\" target=\"_blank\">Spring MVC实战入门训练</a>。</p><p><br/></p><p>确定好之后,我们就可以开始一步步实现了。这里Spring Data JPA的代码就不展示了。了解Spring Data JPA,请参考<a href=\"https://link.zhihu.com/?target=https%3A//course.tianmaying.com/spring-data-jpa\" class=\" wrap external\" target=\"_blank\">Spring Data JPA实战入门训练</a>。直接上核心代码,所有和爬虫整体流程相关的代码我们都放进CrawlerService中。</p><h3>初始网址</h3><p>第一步建立一个初始网址,我们可以根据歌单列表分页的特征得到:</p><pre class=\"brush:java;toolbar:false;\">private void init(String catalog) {\n List<WebPage> webPages = Lists.newArrayList();\n for(int i = 0; i < 43; i++) {\n webPages.add(new WebPage("http://music.163.com/discover/playlist/?order=hot&cat=" + catalog + "&limit=35&offset=" + (i * 35), PageType.playlists));\n }\n webPageRepository.save(webPages);\n}\n\npublic void init() {\n webPageRepository.deleteAll();\n init("全部"); \n init("华语");\n init("欧美");\n init("日语");\n init("韩语");\n init("粤语");\n init("小语种");\n init("流行");\n init("摇滚");\n init("民谣");\n init("电子");\n init("舞曲");\n init("说唱");\n init("轻音乐");\n init("爵士");\n 
init("乡村");\n init("R&B/Soul");\n init("古典");\n init("民族");\n init("英伦");\n init("金属");\n init("朋克");\n init("蓝调");\n init("雷鬼");\n init("世界音乐");\n init("拉丁");\n init("另类/独立");\n init("New Age");\n init("古风");\n init("后摇");\n init("Bossa Nova");\n init("清晨");\n init("夜晚");\n init("学习");\n init("工作");\n init("午休");\n init("下午茶");\n init("地铁");\n init("驾车");\n init("运动");\n init("旅行");\n init("散步");\n init("酒吧");\n init("怀旧");\n init("清新");\n init("浪漫");\n init("性感");\n init("伤感");\n init("治愈");\n init("放松");\n init("孤独");\n init("感动");\n init("兴奋");\n init("快乐");\n init("安静");\n init("思念");\n init("影视原声");\n init("ACG");\n init("校园");\n init("游戏");\n init("70后");\n init("80后");\n init("90后");\n init("网络歌曲");\n init("KTV");\n init("经典");\n init("翻唱");\n init("吉他");\n init("钢琴");\n init("器乐");\n init("儿童");\n init("榜单");\n init("00后");\n}</pre><p>这里,我们初始化了歌单所有分类的列表,通过这些列表,我们就能拿到网易云音乐大部分的歌曲。</p><h3>从爬虫队列中拿到一个URL</h3><p>这里的逻辑非常简单,从mysql中获取一个状态为未爬的网页即可,但是由于我们需要爬的网址非常的多,肯定要用到多线程,因此需要考虑异步的情况:</p><pre class=\"brush:java;toolbar:false;\">public synchronized WebPage getUnCrawlPage() {\n WebPage webPage = webPageRepository.findTopByStatus(Status.uncrawl);\n webPage.setStatus(Status.crawled);\n return webPageRepository.save(webPage);}</pre><h3>爬取页面</h3><p>刚刚说到,我们需要爬取的页面很多,因此我们使用多线程的方式来运行我们的代码,首先我们来将CrawlThread改写成线程的方式,核心代码如下:</p><pre class=\"brush:java;toolbar:false;\">public class CrawlerThread implements Runnable {\n\n @Override\n public void run() {\n while (true) {\n WebPage webPage = crawlerService.getUnCrawlPage(); // TODO: 更好的退出机制\n if (webPage == null)\n return; // 拿不到url,说明没有需要爬的url,直接退出\n try {\n if (fetchHtml(webPage))\n parse(webPage);\n } catch (Exception e) {}\n }\n }}</pre><p>在CrawlerService中,我们还需要提供一个启动爬虫的入口:</p><pre class=\"brush:java;toolbar:false;\">public void crawl() throws InterruptedException {\n ExecutorService executorService = Executors.newFixedThreadPool(MAX_THREADS);\n for(int i = 0; i < MAX_THREADS; i++) {\n executorService.execute(new 
CrawlerThread(this));\n }\n executorService.shutdown();\n executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.DAYS);\n Ehcache ehcache = cacheManager.getEhcache(cacheName);\n ehcache.removeAll();}</pre><p>这样,爬虫的所有核心代码就搞定了,先运行CrawlerService.init()方法初始化爬虫队列,之后运行CrawlerService.crawl()就能让我们的爬虫跑起来啦。</p><h2>提供WEB应用</h2><p>之前我们提到,我们还要使用Spring MVC,通过Spring MVC,我们就能很方便的提供爬虫管理的API啦。更深入地学习Spring MVC,请大家参考<a href=\"https://link.zhihu.com/?target=https%3A//course.tianmaying.com/spring-mvc\" class=\" wrap external\" target=\"_blank\">Spring MVC实战入门训练</a>。</p><pre class=\"brush:java;toolbar:false;\">@RestControllerpublic class CrawlerController {\n\n @Autowired\n private CrawlerService crawlerService;\n\n @Value("${auth.key}")\n private String key;\n\n @ModelAttribute\n public void AuthConfig(@RequestParam String auth) throws AccessException {\n if(!key.equals(auth)) {\n throw new AccessException("auth failed"); \n }\n }\n\n @GetMapping("/init")\n public void init() {\n crawlerService.init();\n }\n\n @GetMapping("/crawl")\n public void crawl() throws InterruptedException {\n crawlerService.crawl();\n }}</pre><p>启动后,按顺序访问<a href=\"https://link.zhihu.com/?target=http%3A//localhost%3A8080/init%26auth%3Dxxx\" class=\" external\" target=\"_blank\"><span class=\"invisible\">http://</span><span class=\"visible\">localhost:8080/init&aut</span><span class=\"invisible\">h=xxx</span><span class=\"ellipsis\"></span></a>和<a href=\"https://link.zhihu.com/?target=http%3A//localhost%3A8080/crawl%26auth%3Dxxx\" class=\" external\" target=\"_blank\"><span class=\"invisible\">http://</span><span class=\"visible\">localhost:8080/crawl&au</span><span class=\"invisible\">th=xxx</span><span class=\"ellipsis\"></span></a>即可,注意,这里的xxx是自己设置的密码,大家可以在application.properties里自行修改。</p><p>最后,我们将所有爬取到的音乐通过页面展示出来:</p><pre class=\"brush:java;toolbar:false;\">@Controllerpublic class SongController {\n\n @Autowired SongRepository songRepository;\n\n @GetMapping("/songs")\n public String songs(Model model,\n 
@PageableDefault(size = 100, sort = "commentCount", direction = Sort.Direction.DESC) Pageable pageable) {\n model.addAttribute("songs", songRepository.findAll(pageable));\n return "songs";\n }}</pre><p>这样,我们的整个爬虫就完成了,整个应用是通过Spring Boot运行的,感兴趣的话可以参考<a href=\"https://link.zhihu.com/?target=https%3A//www.tianmaying.com/tutorial/spring-boot-overview\" class=\" wrap external\" target=\"_blank\">Spring Boot——开发新一代Spring Java应用</a>。</p><h2>后续</h2><h3>爬取效率</h3><p>爬虫爬了两天后,一共爬到了573945条数据,此时数据库访问速度已经变成龟速... 事实证明,对于大型爬虫而言,这样简单粗暴的将数据库作为爬虫队列是不科学的,简单想了一下,我们可以用下列方式来优化爬虫的效率:</p><ul class=\" list-paddingleft-2\"><li><p>将webpage表分拆成playlist、album、song三张表,按照数据顺序先爬playlist,再爬album,最后再爬song(甚至将song拆成多张表)</p></li><li><p>由于网易云音乐的各种对象都有id,将id作为索引,提高mysql的效率</p></li><li><p>获取url的时候按照id从小到大获取,获取完一条删除一条</p></li><li><p>既然mysql达不到我们的要求,可以考虑直接将mysql替换掉,使用redis作为爬虫队列</p></li></ul><p>优化的方式有很多种,有些可以借助工具来实现,有些需要考虑具体的业务逻辑。这里我们不具体实现,感兴趣的同学可以自行实现,看看如何优化可以达到最大的效率。</p><h3>音乐页面访问效率</h3><p>数据量大了之后,影响的不仅仅是爬虫爬的效率,当然还有访问音乐列表的速度,随意访问一个页面都需要4秒左右。最后,我通过缓存解决了这个问题,具体实现我们也不多讲了,可以参考文章<a href=\"https://link.zhihu.com/?target=https%3A//www.tianmaying.com/tutorial/spring-web-ehcache\" class=\" wrap external\" target=\"_blank\">基于Spring的缓存</a>。加上缓存之后页面访问速度达到了100ms左右。</p><h3>数据更新</h3><p>除了爬虫的爬取效率外,还有一个很重要环节,就是数据的更新,评论数据是每天都会变化的,我们的数据当然也要每天更新。这里,我们使用最简单粗暴的方式,建立一个定时任务(有关定时任务可以参考<a href=\"https://link.zhihu.com/?target=https%3A//www.tianmaying.com/tutorial/spring-scheduling-task\" class=\" wrap external\" target=\"_blank\">基于Spring Boot的定时任务</a>),在每天的凌晨1点,找到评论数量大于5000的歌曲,将其状态设置为uncrawl(未爬),启动爬虫即可:</p><pre class=\"brush:java;toolbar:false;\">@GetMapping("/update")@Scheduled(cron = "0 1 0 * * ?")public void update() throws InterruptedException {\n crawlerService.update();}</pre><pre class=\"brush:java;toolbar:false;\">@Asyncpublic void update() throws InterruptedException {\n List<Song> webPages = songRepository.findByCommentCountGreaterThan(5000L);\n webPages.forEach(s -> {\n WebPage p = 
webPageRepository.findOne(s.getUrl());\n p.setStatus(Status.uncrawl);\n webPageRepository.save(p);\n });\n crawl();}</pre><p>整个站点是用Spring MVC假设的,学习Spring MVC,请大家参考和<a href=\"https://link.zhihu.com/?target=https%3A//course.tianmaying.com/spring-mvc\" class=\" wrap external\" target=\"_blank\">Spring MVC实战入门训练</a>和<a href=\"https://link.zhihu.com/?target=http%3A//www.tianmaying.com/tutorial/spring-mvc-quickstart\" class=\" wrap external\" target=\"_blank\">Spring MVC的入门实例</a>。</p><p><br/></p>','D.B | 氧化钡',30,'2016-09-30 09:44:47',2,7,1,4,'20160930095448.jpg'),(5,'new一个小笼包吧!','<p>小笼包调用的自动回复接口是图灵机器人,网址是:<a href=\"http://www.tuling123.com/\">http://www.tuling123.com/</a><br/></p><p>首先先注册</p><p><img src=\"http://121.42.138.57/Blog/ueditor/themes/default/images/spacer.gif\"/><img src=\"/Blog/Blog/images/20161001/1475290275821070189.png\" title=\"1475290275821070189.png\" alt=\"][6PY5A3[1IZFE}{)4KQH]7.png\"/></p><p>注册完后,进入个人中心--->我的机器人,会看到生成的APIKey,这个之后会用到<br/></p><p><img src=\"/Blog/Blog/images/20161001/1475290448419043507.png\" title=\"1475290448419043507.png\" alt=\"2KSKTPY{L66P%BD@$Q`GU1L.png\"/></p><p>点击查看详情,这里我们可以自定义我们机器人的功能,然后点击下载使用文档,</p><p><img src=\"/Blog/Blog/images/20161001/1475291125530006578.png\" title=\"1475291125530006578.png\" alt=\"26B70SFJL]GKCOXNXJ%ZPPO.png\"/></p><p><br/></p><p><img src=\"/Blog/Blog/images/20161001/1475291182954040867.png\" title=\"1475291182954040867.png\" alt=\"7WYSP$372_%KTC@G(LYJ`]1.png\"/></p><p>文档已经很详细了,之后就根据文档的提示来编写程序,下面给出一个小小的demo供大家参考。编程环境:</p><p>eclipse+tomcat7+JDK1.8</p><p style=\"line-height: 16px;\"><img src=\"http://121.42.138.57/Blog/ueditor/dialogs/attachment/fileTypeImages/icon_rar.gif\"/><a style=\"font-size:12px; color:#0066cc;\" href=\"/Blog/Blog/files/20161001/1475290897368035989.zip\" title=\"TuLing.zip\">TuLing.zip</a></p><p><br/></p><p><br/></p><p><br/></p><p>谢谢大家!<br/></p>','D.B | 氧化钡',42,'2016-10-01 11:01:52',1,1,1,1,'20161001110242.gif'),(6,'分享一本书:Java EE开发的颠覆者 SpringBoot实战','<p>引用书里的一句话:在当今Java 
EE开发中,Spring框架是当之无愧的王者,而Spring Boot是Spring主推的基于"习惯优于配置"的原则,让你能够快速搭建应用的框架,从而使得Java EE开发变得异常简单。</p><p><br/></p><p><a href=\"http://pan.baidu.com/s/1c2bsUAk\">http://pan.baidu.com/s/1c2bsUAk</a><br/></p><p>赶紧把这个技能收入囊中吧~<br/> </p>','D.B | 氧化钡',63,'2016-10-02 10:00:33',1,1,0,10,'20161002100445.jpg'),(7,'fasfd ','<p>sdaf dsf <br/></p>','D.B | 氧化钡',6,'2016-10-13 00:58:05',2,7,0,10,'s1.jpg'),(8,'测试代码','<p>24324</p>','D.B | 氧化钡',25,'2016-10-22 19:27:45',1,4,0,10,'s1.jpg'),(9,'Lessons Learned in Software Development','<h3 style=\"text-align: left; text-indent: 2em;\">Development</h3><p style=\"text-align: left; text-indent: 2em;\"><strong>1. Start small, then extend.</strong> Whether creating a new \nsystem, or adding a feature to an existing system, I always start by \nmaking a very simple version with almost none of the required \nfunctionality. Then I extend the solution step by step, until it does \nwhat it is supposed to. I have never been able to plan everything out in\n detail from the beginning. Instead, I learn as I go along, and this \nnewly discovered information gets used in the solution.</p><p style=\"text-align: left; text-indent: 2em;\">I like this quote from John Gall: <em> “A complex system that works is invariably found to have evolved from a simple system that worked.”</em></p><p style=\"text-align: left; text-indent: 2em;\"><strong>2. Change one thing at a time.</strong> When you develop, and\n some tests fail, or a feature stops working, it’s much easier to find \nthe problem if you only changed one thing. In other words, use short \niterations. Do one thing, make sure it works, repeat. This applies down \nto the level of commits. If you have to refactor the code before you add\n a new feature, commit the refactoring first, then (in a new commit) add\n the new feature.</p><p style=\"text-align: left; text-indent: 2em;\"><strong>3. 
Add logging and error handling early.</strong> When \ndeveloping a new system, one of the first things I do is adding logging \nand error handling, because both are useful from the very beginning. For\n all systems that are bigger than a handful of lines of code, you need \nsome way of knowing what happens in the program. Perhaps not when it is \nworking as expected, but as soon as it doesn’t, you must be able to see \nwhat’s happening. The same goes for error handling – errors and \nexceptions happen in the beginning too, so the sooner you handle them in\n a systematic way, the better.</p><p style=\"text-align: left; text-indent: 2em;\"><strong>4. All new lines must be executed at least once.</strong> Before\n you are done with a feature, you have to test it. Otherwise, how do you\n know that it does what it is supposed to do? Often, the best way is by \nautomatic tests, but not always. But no matter what, <em>every new line of code has to be executed at least once</em>.</p><p style=\"text-align: left; text-indent: 2em;\">Sometimes it can be hard to trigger the right conditions. \nFortunately, it’s easy to cheat a bit. For example, the error handling \non a database call can be checked by temporarily misspelling a column \nname. Or, an if-statement can be temporarily inverted (“if error” \nbecomes “if not error”) in order to trigger something that rarely \nhappens, just to make sure that code is run and does what it should.</p><p style=\"text-align: left; text-indent: 2em;\">Sometimes I see bugs that show that a certain line of code can never \nhave been run by the developer. It can look fine when reviewed, but \nstill not work. You avoid embarrassment if your policy is to always \nexecute every new line you write.</p><p style=\"text-align: left; text-indent: 2em;\"><strong>5. Test the parts before the whole.</strong> Well-tested \nparts save time. 
Often there are problems with integrating different \nparts, for example from mismatched or misunderstood interfaces between \nmodules. If you can trust that the parts work as expected, it becomes \nmuch easier to track down the integration problems.</p><p style=\"text-align: left; text-indent: 2em;\"><strong>6. Everything takes longer than you think.</strong> Especially\n in programming. It is hard to estimate how much time a feature will \ntake even if everything goes smoothly. But when developing software, it \nis quite common to run in to unexpected problems: a simple merge turns \nout to cause a subtle bug, an upgrade of a framework means some \nfunctions must be changed or an API call doesn’t work as promised.</p><p style=\"text-align: left; text-indent: 2em;\">I think there is a lot of truth in Hofstadter Law: <em>It always takes longer than you expect, even when you take into account Hofstadter’s Law.</em></p><p style=\"text-align: left; text-indent: 2em;\"><strong>7. First understand the existing code.</strong> Most coding \nrequires changing existing code in some way. Even if it is a new \nfeature, it needs to fit into the existing program. And before you can \nfit the new stuff in, you need to understand the current solution. \nOtherwise you may accidentally break some of the existing functionality.\n This is means that <em>reading </em>code is a skill that is as necessary as <em>writing </em>code.\n It is also part of the reason why seemingly small changes can still \ntake a long time – you must understand the context in which you make the\n change.</p><p style=\"text-align: left; text-indent: 2em;\"><strong>8. Read and run.</strong> Fortunately, there are two \ncomplementary methods for understanding code. You can read the code, and\n you can run the code. Running the code can be a great help when \nfiguring out what it does. 
Be sure to make use of both methods.</p><h3 style=\"text-align: left; text-indent: 2em;\">Troubleshooting</h3><p style=\"text-align: left; text-indent: 2em;\"><strong>9. There will always be bugs.</strong> I don’t like \napproaches to software development that claim to “get it right the first\n time”. No matter how much effort you put in, there will always be bugs \n(the definition of a bug pretty much is: “we didn’t think of that”). A \nmuch better approach is to have a system in place that lets you quickly \ntroubleshoot problems, fix the bugs and deploy the fixes.</p><p style=\"text-align: left; text-indent: 2em;\"><strong>10. Solve trouble reports.</strong> Every developer should \nspend a portion of their time handling trouble reports from customers \nand fixing bugs. It gives you a much better understanding of what the \ncustomers are trying to do, how the system is used, how easy or hard it \nis to troubleshoot and how well the system is designed. It’s also a \ngreat way of taking responsibility for what you develop. Don’t miss out \non all these benefits.</p><p style=\"text-align: left; text-indent: 2em;\"><strong>11. Reproduce the problem.</strong> The first step when \nfixing a bug is to reproduce the problem. Then you make sure that when \nthe fix is added, the problem is gone. This simple rule makes sure you \nare not assuming something is a problem when it isn’t, and makes sure \nthe solution actually does what you think it does.</p><p style=\"text-align: left; text-indent: 2em;\"><strong>12. Fix the known errors, then see what’s left.</strong> Sometimes\n there are several problems present that you know about. The different \nbugs can interact with each other and cause strange things to happen. \nInstead of trying to work out what happens in those cases, fix all the \nknow problems and then see what symptoms remain.</p><p style=\"text-align: left; text-indent: 2em;\"><strong>13. 
Assume no coincidences.</strong> When testing and \ntroubleshooting, never believe in coincidences. You changed a timer \nvalue, and now the system restarts more often. Not a coincidence. A new \nfeature was added, and an unrelated feature becomes slower? Not a \ncoincidence. Instead, investigate.</p><p style=\"text-align: left; text-indent: 2em;\"><strong>14. Correlate with timestamps.</strong> When troubleshooting,\n use the timestamp of events as a help. Look for even increments. For \nexample, if the system restarted, and a request was sent out around 3000\n milliseconds before, maybe a timer triggered the action that lead to \nthe restart.</p><h3 style=\"text-align: left; text-indent: 2em;\">Cooperation</h3><p style=\"text-align: left; text-indent: 2em;\"><strong>15. Face to face has the highest bandwidth.</strong> When \ndiscussing how to solve a problem, being face to face beats video, call,\n chat and email. I am often amazed at how much better the solutions are \nafter discussing them in person with colleagues.</p><p style=\"text-align: left; text-indent: 2em;\"><strong>16. Rubber ducking.</strong> Whenever you are stuck, go to a \ncolleague and explain the problem to them. Many times, as you talk, you \nrealize what the problem is, even if your colleague doesn’t say a word. \nSounds like magic, but works surprisingly often.</p><p style=\"text-align: left; text-indent: 2em;\"><strong>17. Ask.</strong> Reading and running the code is often great\n for figuring out what it does and how it works. But if you have the \npossibility to ask someone knowledgeable (perhaps the original author), \nuse that option too. Being able to ask specific questions, and follow-up\n questions to those, can give you information in minutes that would \notherwise take days to get.</p><p style=\"text-align: left; text-indent: 2em;\"><strong>18. Share credit.</strong> Make sure to give credit where \ncredit is due. 
Say: “Marcus came up with the idea to try…” (if he did), \ninstead of “we tried …”. Go out of your way to mention who else helped \nor contributed.</p><h3 style=\"text-align: left; text-indent: 2em;\">Miscellaneous</h3><p style=\"text-align: left; text-indent: 2em;\"><strong>19. Try it.</strong> If you are unsure of how a certain \nlanguage feature works, it is easy to write a little program that shows \nhow it works. The same applies when testing the system you are \ndeveloping. What happens if I set this parameter to -1? What happens if \nthis service is down when I reboot the system? Explore how it works – \nfiddling around often reveals bugs, and at the same time it deepens your\n understanding of how the system works.</p><p style=\"text-align: left; text-indent: 2em;\"><strong>20. Sleep on it.</strong> If you are working on a difficult \nproblem, try to get in a night’s sleep before you decide. Then your \nsubconscious mind works on the problem even when you aren’t actively \nthinking about it. As a result, the solution can seem obvious the next \nday.</p><p style=\"text-align: left; text-indent: 2em;\"><strong>21. Change.</strong> Don’t be afraid to change roles or jobs \nevery once in a while. It is stimulating to work with different people, \non a different product or in a different company. In my view, too many \npeople just passively stay at the same job year after year, only \nchanging if they are forced to.</p><p style=\"text-align: left; text-indent: 2em;\"><strong>22. Keep learning.</strong> One of the great things with \nsoftware development is that there is always room to learn and know \nmore. Try out different programming languages and tools, read books on \nsoftware development, take MOOC courses. 
Small improvements soon add up \nto make a real difference in your knowledge and abilities.</p>','D.B | 氧化钡',46,'2016-10-23 11:16:21',4,-1,0,10,'s1.jpg'),(10,'222','<p>2222</p>','D.B | 氧化钡',6,'2016-11-02 10:50:59',1,1,0,10,'20161102201227.jpg'),(11,'23423','<p>武器二<br/></p>','D.B | 氧化钡',2,'2016-11-02 10:52:33',1,1,1,1,'20161103161729.jpg');
UNLOCK TABLES;
/* Table structure for table `t_blogadvice`: visitor messages, each with the
   sender's nickname and IP address, the message text and an optional reply. */
CREATE TABLE `t_blogadvice` (
    `id`          INT(11)       NOT NULL AUTO_INCREMENT,
    `nickName`    VARCHAR(20)   DEFAULT NULL,
    /* 45 characters is the longest textual IPv6 form (IPv4-mapped notation).
       The previous width of 20 only fitted dotted-quad IPv4 addresses. */
    `userIP`      VARCHAR(45)   DEFAULT NULL,
    `content`     VARCHAR(1000) DEFAULT NULL,
    `publishTime` DATETIME      DEFAULT NULL,
    `reply`       VARCHAR(1000) DEFAULT NULL,
    PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=17 DEFAULT CHARSET=utf8;
/* Seed data for `t_blogadvice`, one row per line. */
LOCK TABLES `t_blogadvice` WRITE;
INSERT INTO `t_blogadvice` (`id`, `nickName`, `userIP`, `content`, `publishTime`, `reply`)
VALUES
    (1,'elic','112.96.145.132','把东宝师兄拖出去打靶','2016-09-29 01:00:36','手动可怜脸~'),
    (2,'lin','183.63.97.163','有时间交流交流','2016-09-29 01:32:50','嗯嗯,好哇'),
    (3,'小宝','112.96.128.128','D.B是逗b的缩写吗?哈哈,越来越6了','2016-09-29 07:50:22','哈哈,是东宝的缩写啦,小宝好久不见啊!'),
    (4,'+萌','120.236.177.115','66666........','2016-09-29 08:06:33','哈哈,还不是受你这个大神的影响'),
    (5,'湘萍','223.104.1.106','早安','2016-09-29 08:19:31','早啊'),
    (6,'胖子','112.96.170.123',' 东宝大神!','2016-09-29 09:15:50','哈哈谢谢!'),
    (7,'传输媒体','219.128.252.233',' 请客 请客','2016-09-29 10:28:48','鉴定完毕,此人是隔壁老王'),
    (8,'叶叶耶!','112.96.97.111','大神,教我做一个这样的页面','2016-09-29 11:39:20','网站源码我会分享出来的'),
    (9,'Miss Z','112.96.170.100','厉害的大神~继续加油哈,到时别忘了指点指点我等无名小卒哈哈哈','2016-09-29 11:45:18','哈,可以啊'),
    (10,'那日阳光明媚','183.62.181.6','膜拜大神','2016-09-29 12:04:38','嘻嘻'),
    (11,'昵称','183.62.181.6','点开播放音乐 ,进去另外一个导航页面,音乐就停止了。','2016-09-29 12:10:27','嗯嗯,谢谢你的建议,我会修复这个Bug的'),
    (12,'斯里兰卡lo','112.96.170.155','宝大师中午好!\n\n','2016-09-29 13:35:48','诶,你好呀~~'),
    (13,'风','58.248.192.84','东宝好牛逼啊,大神!','2016-09-29 15:05:22','谢谢,嘻嘻'),
    (14,'海棠_小鱼','183.62.180.9','滴~打卡!以后每天多一个任务:追随大神的脚步!哈哈~','2016-09-29 17:28:27','哎呀,怪不好意思的'),
    (15,'机器人','183.62.180.16','东宝好厉害。。','2016-09-30 11:41:23','~~~~'),
    (16,'海棠_小鱼','183.62.180.9','滴~打卡!国庆快乐~22','2016-09-30 23:52:44','同乐同乐~22');
UNLOCK TABLES;
/* Table structure for table `t_bloger`: the blog owner's login and profile. */
CREATE TABLE `t_bloger` (
    `id`       INT(11)      NOT NULL AUTO_INCREMENT,
    `userName` VARCHAR(20)  DEFAULT NULL,
    /* Widened from 20 so a salted password hash (for example a 60-character
       bcrypt string) fits. NOTE(review): the seed row in this script stores
       the password in clear text - confirm whether the application hashes it. */
    `password` VARCHAR(255) DEFAULT NULL,
    `nickName` VARCHAR(20)  DEFAULT NULL,
    `job`      VARCHAR(20)  DEFAULT NULL,
    `hobby`    VARCHAR(20)  DEFAULT NULL,
    /* Widened from 20, which is shorter than many ordinary e-mail addresses;
       254 is the usual upper bound for a deliverable address. */
    `email`    VARCHAR(254) DEFAULT NULL,
    `webClick` INT(11)      DEFAULT NULL,
    PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=2 DEFAULT CHARSET=utf8;
/* Seed data for `t_bloger`: the single blog-owner account.
   NOTE(review): the e-mail value reads like a redaction placeholder and the
   password is a clear-text literal - confirm both before using this script. */
LOCK TABLES `t_bloger` WRITE;
INSERT INTO `t_bloger` (`id`, `userName`, `password`, `nickName`, `job`, `hobby`, `email`, `webClick`)
VALUES
    (1,'ldb','123456','D.B | 氧化钡','学生、未来的程序猿','瞎折腾一些东西','[email protected]',457);
UNLOCK TABLES;
/* Table structure for table `t_blogtag`: tags, each attached to one blog type. */
CREATE TABLE `t_blogtag` (
    `id`      INT(11)     NOT NULL AUTO_INCREMENT,
    `tagName` VARCHAR(20) DEFAULT NULL,
    `typeId`  INT(11)     DEFAULT NULL,
    PRIMARY KEY (`id`),
    KEY `typeId` (`typeId`),
    /* `t_blogtype` is created after this table; the script header disables
       FOREIGN_KEY_CHECKS so the forward reference is accepted. */
    CONSTRAINT `t_blogtag_ibfk_1` FOREIGN KEY (`typeId`) REFERENCES `t_blogtype` (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=8 DEFAULT CHARSET=utf8;
/* Seed data for `t_blogtag`, one row per line. */
LOCK TABLES `t_blogtag` WRITE;
INSERT INTO `t_blogtag` (`id`, `tagName`, `typeId`)
VALUES
    (1,'Java',1),
    (2,'SSM框架',1),
    (3,'S2SH框架',1),
    (4,'JavaScript',1),
    (5,'前端知识',1),
    (6,'生活感悟',5),
    (7,'知乎干货',1);
UNLOCK TABLES;
/* Table structure for table `t_blogtype`: blog categories. Referenced by the
   foreign keys on `t_blog`.`blogTypeId` and `t_blogtag`.`typeId`. */
CREATE TABLE `t_blogtype` (
    `id`       INT(11)     NOT NULL AUTO_INCREMENT,
    `typeName` VARCHAR(20) DEFAULT NULL,
    PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=6 DEFAULT CHARSET=utf8;
/* Seed data for `t_blogtype`, one row per line. */
LOCK TABLES `t_blogtype` WRITE;
INSERT INTO `t_blogtype` (`id`, `typeName`)
VALUES
    (1,'技术探讨'),
    (2,'技术干货'),
    (3,'项目分享'),
    (4,'经验总结'),
    (5,'碎言碎语');
UNLOCK TABLES;
/* Table structure for table `t_link`: external links, each with a name, URL,
   contact e-mail and an ordering number. */
CREATE TABLE `t_link` (
    `id`        INT(11)      NOT NULL AUTO_INCREMENT,
    `linkName`  VARCHAR(20)  DEFAULT NULL,
    `linkUrl`   VARCHAR(100) DEFAULT NULL,
    `linkEmail` VARCHAR(50)  DEFAULT NULL,
    `orderNum`  INT(11)      DEFAULT NULL,
    PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=utf8;
/* Seed data for `t_link`, one row per line.
   NOTE(review): every linkEmail value reads like a redaction placeholder -
   confirm against the original export. */
LOCK TABLES `t_link` WRITE;
INSERT INTO `t_link` (`id`, `linkName`, `linkUrl`, `linkEmail`, `orderNum`)
VALUES
    (1,'知乎','http://www.zhihu.com','[email protected]',1),
    (2,'GitHub','https://github.com/','[email protected]',2),
    (3,'华师教务网','http://jwc.scnu.edu.cn/','[email protected]',3),
    (4,'华师图书馆2','http://202.116.41.246:8080/opac/search.php','[email protected]',4);
UNLOCK TABLES;
/* Restore the session variables from the @OLD_* user variables that the
   statements at the top of the script filled in. */
/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;