generated from we-mobius/haskell-template
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
33c8ae3
commit 717cf14
Showing
17 changed files
with
1,207 additions
and
18 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# grappler | ||
|
||
## 0.0.1.0 | ||
|
||
- init package | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
grappler | ||
Copyright (C) 2021 Cigaret | ||
|
||
This program is free software: you can redistribute it and/or modify | ||
it under the terms of the GNU General Public License as published by | ||
the Free Software Foundation, either version 3 of the License, or | ||
(at your option) any later version. | ||
|
||
This program is distributed in the hope that it will be useful, | ||
but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
GNU General Public License for more details. | ||
|
||
You should have received a copy of the GNU General Public License | ||
along with this program. If not, see <http://www.gnu.org/licenses/>. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# grappler | ||
|
||
周期性检索「中华人民共和国中央人民政府」网站指定页面的数据条目,根据标题判断是否是新数据,将新数据写入目标数据库中。 | ||
|
||
监测页面如下: | ||
|
||
- [首页 > 政策 > 最新](http://www.gov.cn/zhengce/zuixin.htm) | ||
可能在域名解析层面设置了访问限制,直接访问 IP 地址可以不设代理拿到数据([IP 地址](http://182.18.80.137:80/zhengce/zuixin.htm))。 | ||
- [太原市人民政府 > 政府信息公开 > 法定主动公开内容 > 国民经济和社会发展规划](http://www.taiyuan.gov.cn/fzlm/gkmlpt/zdgk/index.shtml?chan=25) | ||
很奇怪,这个没有访问限制,但数据是动态加载的,需要直接请求接口。 | ||
[IP 地址](http://221.204.12.122:80/fzlm/gkmlpt/zdgk/index.shtml?chan=25)。 | ||
[数据地址](http://taiyuan.gov.cn/intertidwebapp/govChanInfo/getDocuments?Index=1&pageSize=20&siteId=1&ChannelType=1&KeyWord=&KeyWordType=&chanId=25&order=1)。 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
import Distribution.Simple | ||
main = defaultMain | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
{-# LANGUAGE OverloadedStrings #-} | ||
|
||
module Gov.Taiyuan where | ||
|
||
import qualified Data.ByteString.Lazy.UTF8 as UTF8 | ||
import Data.Foldable | ||
import Network.HTTP.Client | ||
import Network.HTTP.Types.Header | ||
|
||
type UrlString = String | ||
type TargetUrls = [UrlString] | ||
|
||
targetUrls :: TargetUrls | ||
targetUrls = [ | ||
"http://taiyuan.gov.cn/intertidwebapp/govChanInfo/getDocuments?Index=1&pageSize=20&siteId=1&ChannelType=1&KeyWord=&KeyWordType=&chanId=25&order=1" | ||
] | ||
|
||
-- 添加通用的 Proxy 配置 | ||
setCommonProxy :: ManagerSettings -> ManagerSettings | ||
setCommonProxy = managerSetProxy (useProxy Proxy { | ||
proxyHost = "127.0.0.1", | ||
proxyPort = 10809 | ||
}) | ||
|
||
-- 添加通用的 Manager 设置 | ||
setCommonManager :: ManagerSettings -> ManagerSettings | ||
setCommonManager settings = settings { managerResponseTimeout = responseTimeoutMicro 30000000 } | ||
|
||
prepareRequests = map (\url -> do | ||
manager <- newManager $ setCommonManager defaultManagerSettings | ||
initRequest <- parseRequest url | ||
let request = initRequest { | ||
method = "GET", | ||
requestHeaders = [ | ||
-- 不设置 User-Agent 也可以拿到数据 | ||
(hUserAgent, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36 Edg/93.0.961.38") | ||
] | ||
} | ||
response <- httpLbs request manager | ||
return (url, request, response) | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
{-# LANGUAGE NamedFieldPuns #-} | ||
{-# LANGUAGE OverloadedStrings #-} | ||
|
||
module Gov.Top | ||
( grap | ||
) where | ||
|
||
import Control.Concurrent | ||
import Control.Monad | ||
import qualified Data.ByteString.Lazy.UTF8 as UTF8 | ||
import Data.Char | ||
import Network.HTTP.Client | ||
import Network.HTTP.Types.Header | ||
import Text.HTML.TagSoup | ||
|
||
type Name = String | ||
type UrlString = String | ||
data Target | ||
= Target | ||
{ name :: Name | ||
, url :: UrlString | ||
} | ||
deriving (Show) | ||
type Targets = [Target] | ||
|
||
targets :: Targets | ||
targets = [Target "Zhengce" "http://www.gov.cn/zhengce/zuixin.htm"] | ||
|
||
-- 添加通用的 Proxy 配置 | ||
setCommonProxy :: ManagerSettings -> ManagerSettings | ||
setCommonProxy = managerSetProxy (useProxy Proxy { | ||
proxyHost = "127.0.0.1", | ||
proxyPort = 10809 | ||
}) | ||
|
||
-- 添加通用的 Manager 设置 | ||
setCommonManager :: ManagerSettings -> ManagerSettings | ||
setCommonManager settings = settings { managerResponseTimeout = responseTimeoutMicro (30 * 1000000) } | ||
|
||
type RequestResponse = Response UTF8.ByteString | ||
data RequestResult | ||
= RequestResult | ||
{ target :: Target | ||
, request :: Request | ||
, response :: RequestResponse | ||
} | ||
deriving (Show) | ||
|
||
prepareRequests :: [Target] -> [IO RequestResult] | ||
prepareRequests = map (\target -> do | ||
|
||
manager <- newManager $ setCommonManager defaultManagerSettings | ||
|
||
initRequest <- parseRequest $ url target | ||
let request = initRequest { | ||
method = "GET", | ||
requestHeaders = [ | ||
-- 不设置 User-Agent 也可以拿到数据 | ||
(hUserAgent, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36 Edg/93.0.961.38") | ||
] | ||
} | ||
|
||
response <- httpLbs request manager | ||
|
||
return (RequestResult target request response) | ||
) | ||
|
||
execRequest :: IO [(Target, Request, RequestResponse, String)] | ||
execRequest = do | ||
responses <- sequenceA $ prepareRequests targets | ||
traverse (\RequestResult { target, request, response } -> do | ||
putStrLn $ "The target url was:" ++ url target | ||
putStrLn $ "The request info was:" ++ show request | ||
putStrLn $ "The response status was: " ++ show (responseStatus response) | ||
|
||
let body = UTF8.toString $ responseBody response | ||
|
||
writeFile ("./packages/grappler/data/" ++ "TopGov" ++ ".txt") body | ||
|
||
putStrLn "Done" | ||
|
||
return (target, request, response, body)) responses | ||
|
||
parseBody :: String -> [Tag String] | ||
parseBody = parseTags | ||
|
||
extractList :: [Tag String] -> [[Tag String]] | ||
extractList = | ||
map ( | ||
concat . | ||
(\tags -> [ | ||
(take 2 . dropWhile (~/= ("<a>" :: String))) tags, | ||
(take 1 . drop 1 . dropWhile (~/= TagOpen "span" [("class" :: String,"date")])) tags | ||
]) . | ||
takeWhile (~/= ("</li>" :: String)) | ||
) . | ||
sections (~== ("<h4>" :: String)) . | ||
takeWhile (~/= TagOpen "span" [("class" :: String, "public_more")]) . | ||
dropWhile (~/= TagOpen "div" [("class" :: String, "news_box")] ) | ||
|
||
newtype Item | ||
= Item (String, String, String) | ||
extractData :: [Tag String] -> Item | ||
extractData tags = Item (title, url, date) | ||
where | ||
url = fromAttrib "href" . head $ tags | ||
title = fromTagText (tags !! 1) | ||
date = fromTagText (tags !! 2) | ||
|
||
instance Show Item where | ||
show (Item (title, url , date)) = "(" ++ title ++ ", " ++ url ++ ", " ++ date ++ ")" | ||
|
||
formatData :: Item -> Item | ||
formatData (Item (title, url, date)) = Item (formatTitle title, formatUrl url, formatDate date) | ||
where | ||
formatTitle = trim | ||
-- 有的连接没有 domain,需要进行补全 | ||
formatUrl [] = [] | ||
formatUrl ('/':xs) = trim $ "http://www.gov.cn/" ++ xs | ||
formatUrl u = trim u | ||
formatDate = trim | ||
trim = filter (not . isSpace) | ||
|
||
grap :: IO () | ||
grap = forever $ | ||
forkIO (do | ||
[(target, _, _, body)] <- execRequest | ||
print target | ||
let tags = map (formatData . extractData) . extractList $ parseBody body | ||
print tags | ||
writeFile "./packages/grappler/data/tags.tmp" (show tags)) | ||
>> threadDelay (60 * 10 * 1000000) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#!/usr/bin/env stack | ||
{-stack | ||
script | ||
--resolver lts-18.6 | ||
--package utf8-string | ||
--package http-client,http-conduit,http-types | ||
--package tagsoup | ||
-} | ||
|
||
import qualified Gov.Top as Top | ||
|
||
main :: IO () | ||
main = do | ||
Top.grap |
Oops, something went wrong.