Commit

feat: update latest works
kongxiangyan committed Sep 18, 2021
1 parent 33c8ae3 commit 717cf14
Showing 17 changed files with 1,207 additions and 18 deletions.
Binary file modified .hlint.yaml
Binary file not shown.
32 changes: 16 additions & 16 deletions .stylish-haskell.yaml
@@ -57,43 +57,43 @@ steps:
# - "," in fields is always aligned with "{"
# - "}" is likewise always aligned with "{"
#
# - records:
- records:
# # How to format equals sign between type constructor and data constructor.
# # Possible values:
# # - "same_line" -- leave "=" AND data constructor on the same line as the type constructor.
# # - "indent N" -- insert a new line and N spaces from the beginning of the next line.
# equals: "indent 2"
equals: "indent 2"
#
# # How to format first field of each record constructor.
# # Possible values:
# # - "same_line" -- "{" and first field goes on the same line as the data constructor.
# # - "indent N" -- insert a new line and N spaces from the beginning of the data constructor
# first_field: "indent 2"
first_field: "indent 2"
#
# # How many spaces to insert between the column with "," and the beginning of the comment in the next line.
# field_comment: 2
field_comment: 2
#
# # How many spaces to insert before "deriving" clause. Deriving clauses are always on separate lines.
# deriving: 2
deriving: 2
#
# # How many spaces to insert before "via" clause counted from indentation of deriving clause
# # Possible values:
# # - "same_line" -- "via" part goes on the same line as "deriving" keyword.
# # - "indent N" -- insert a new line and N spaces from the beginning of "deriving" keyword.
# via: "indent 2"
via: "indent 2"
#
# # Sort typeclass names in the "deriving" list alphabetically.
# sort_deriving: true
sort_deriving: true
#
# # Whether or not to break enums onto several lines
# #
# # Default: false
# break_enums: false
break_enums: false
#
# # Whether or not to break single constructor data types before `=` sign
# #
# # Default: true
# break_single_constructors: true
break_single_constructors: true
#
# # Whether or not to curry constraints on function.
# #
@@ -102,7 +102,7 @@ steps:
# # Instead of @allValues :: (Enum a, Bounded a) => Proxy a -> [a]@
# #
# # Default: false
# curried_context: false
curried_context: true

# Align the right hand side of some elements. This is quite conservative
# and only applies to statements where each element occupies a single
@@ -240,7 +240,7 @@ steps:
# Useful for 'file' and 'group' align settings.
#
# Default: 4
list_padding: 4
list_padding: 2

# Separate lists option affects formatting of import list for type
# or class. The only difference is single space between type and list
@@ -330,18 +330,18 @@ steps:
language_prefix: LANGUAGE

# Replace tabs by spaces. This is disabled by default.
# - tabs:
# # Number of spaces to use for each tab. Default: 8, as specified by the
# # Haskell report.
# spaces: 8
- tabs:
# Number of spaces to use for each tab. Default: 8, as specified by the
# Haskell report.
spaces: 2

# Remove trailing whitespace
- trailing_whitespace: {}

# Squash multiple spaces between the left and right hand sides of some
# elements into single spaces. Basically, this undoes the effect of
# simple_align but is a bit less conservative.
# - squash: {}
# - squash: {}

# A common setting is the number of columns (parts of) code will be wrapped
# to. Different steps take this into account.
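For reference, the record settings switched on above (`equals` and `first_field` at `"indent 2"`, `deriving: 2`, `sort_deriving: true`, `break_single_constructors: true`) lay a declaration out roughly as follows; the type below is purely illustrative, not output produced by stylish-haskell:

```haskell
-- Illustrative only: how the enabled record settings shape a declaration.
data Entry
  = Entry                 -- break_single_constructors: "=" moves to its own line
  { entryTitle :: String  -- first_field "indent 2": "{" and the first field start a new line
  , entryUrl   :: String  -- "," stays aligned with "{"
  , entryDate  :: String
  }
  deriving (Eq, Show)     -- deriving: 2, with sort_deriving keeping the list alphabetical
```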
Binary file modified hie.yaml
Binary file not shown.
6 changes: 6 additions & 0 deletions packages/grappler/CHANGELOG.md
@@ -0,0 +1,6 @@
# grappler

## 0.0.1.0

- init package

15 changes: 15 additions & 0 deletions packages/grappler/LICENSE
@@ -0,0 +1,15 @@
grappler
Copyright (C) 2021 Cigaret

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
12 changes: 12 additions & 0 deletions packages/grappler/README.md
@@ -0,0 +1,12 @@
# grappler

Periodically crawls the data entries on selected pages of the website of the Central People's Government of the People's Republic of China, uses the title to decide whether an entry is new, and writes new entries to the target database (a rough sketch of that last step follows the page list below).

Monitored pages:

- [Home > Policies > Latest](http://www.gov.cn/zhengce/zuixin.htm)
  Access appears to be restricted at the DNS resolution level; requesting the IP address directly returns the data without a proxy ([IP address](http://182.18.80.137:80/zhengce/zuixin.htm)).
- [Taiyuan Municipal People's Government > Government Information Disclosure > Statutory Proactive Disclosure > National Economic and Social Development Plans](http://www.taiyuan.gov.cn/fzlm/gkmlpt/zdgk/index.shtml?chan=25)
  Oddly, this page has no access restriction, but its data is loaded dynamically, so the API has to be requested directly.
  [IP address](http://221.204.12.122:80/fzlm/gkmlpt/zdgk/index.shtml?chan=25)
  [Data endpoint](http://taiyuan.gov.cn/intertidwebapp/govChanInfo/getDocuments?Index=1&pageSize=20&siteId=1&ChannelType=1&KeyWord=&KeyWordType=&chanId=25&order=1)
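The "is this title new?" check and the database write are described above but not part of this commit; a minimal sketch of that step, with `fetchKnownTitles` and `insertItem` as hypothetical stand-ins for the eventual storage layer, could look like this:

```haskell
import qualified Data.Set as Set

-- Hypothetical item shape and storage hooks; only the "keep the new entries" logic matters here.
data NewsItem = NewsItem { itemTitle :: String, itemUrl :: String, itemDate :: String }

fetchKnownTitles :: IO (Set.Set String)  -- assumed: titles already stored in the target database
fetchKnownTitles = pure Set.empty

insertItem :: NewsItem -> IO ()          -- assumed: writes one new entry to the target database
insertItem _ = pure ()

storeNewItems :: [NewsItem] -> IO ()
storeNewItems items = do
  known <- fetchKnownTitles
  let fresh = filter (\item -> itemTitle item `Set.notMember` known) items
  mapM_ insertItem fresh
```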
3 changes: 3 additions & 0 deletions packages/grappler/Setup.hs
@@ -0,0 +1,3 @@
import Distribution.Simple
main = defaultMain

41 changes: 41 additions & 0 deletions packages/grappler/app/Gov/Taiyuan.hs
@@ -0,0 +1,41 @@
{-# LANGUAGE OverloadedStrings #-}

module Gov.Taiyuan where

import qualified Data.ByteString.Lazy.UTF8 as UTF8
import Data.Foldable
import Network.HTTP.Client
import Network.HTTP.Types.Header

type UrlString = String
type TargetUrls = [UrlString]

targetUrls :: TargetUrls
targetUrls = [
"http://taiyuan.gov.cn/intertidwebapp/govChanInfo/getDocuments?Index=1&pageSize=20&siteId=1&ChannelType=1&KeyWord=&KeyWordType=&chanId=25&order=1"
]

-- Common Proxy configuration
setCommonProxy :: ManagerSettings -> ManagerSettings
setCommonProxy = managerSetProxy (useProxy Proxy {
proxyHost = "127.0.0.1",
proxyPort = 10809
})

-- Common Manager settings
setCommonManager :: ManagerSettings -> ManagerSettings
setCommonManager settings = settings { managerResponseTimeout = responseTimeoutMicro 30000000 }

prepareRequests :: TargetUrls -> [IO (UrlString, Request, Response UTF8.ByteString)]
prepareRequests = map (\url -> do
manager <- newManager $ setCommonManager defaultManagerSettings
initRequest <- parseRequest url
let request = initRequest {
method = "GET",
requestHeaders = [
-- The data can be fetched even without setting a User-Agent
(hUserAgent, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36 Edg/93.0.961.38")
]
}
response <- httpLbs request manager
return (url, request, response)
)
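`prepareRequests` only builds the request actions; a small driver that runs them and dumps the raw JSON bodies could look like the sketch below, assuming the definitions above are in scope (the JSON is left undecoded):

```haskell
-- Sketch: execute every prepared request against the Taiyuan endpoint and print the raw JSON body.
fetchTaiyuan :: IO ()
fetchTaiyuan = do
  results <- sequenceA (prepareRequests targetUrls)
  for_ results $ \(url, _request, response) -> do
    putStrLn $ url ++ " -> " ++ show (responseStatus response)
    putStrLn $ UTF8.toString (responseBody response)
```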
132 changes: 132 additions & 0 deletions packages/grappler/app/Gov/Top.hs
@@ -0,0 +1,132 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}

module Gov.Top
( grap
) where

import Control.Concurrent
import Control.Monad
import qualified Data.ByteString.Lazy.UTF8 as UTF8
import Data.Char
import Network.HTTP.Client
import Network.HTTP.Types.Header
import Text.HTML.TagSoup

type Name = String
type UrlString = String
data Target
= Target
{ name :: Name
, url :: UrlString
}
deriving (Show)
type Targets = [Target]

targets :: Targets
targets = [Target "Zhengce" "http://www.gov.cn/zhengce/zuixin.htm"]

-- Common Proxy configuration
setCommonProxy :: ManagerSettings -> ManagerSettings
setCommonProxy = managerSetProxy (useProxy Proxy {
proxyHost = "127.0.0.1",
proxyPort = 10809
})

-- Common Manager settings
setCommonManager :: ManagerSettings -> ManagerSettings
setCommonManager settings = settings { managerResponseTimeout = responseTimeoutMicro (30 * 1000000) }

type RequestResponse = Response UTF8.ByteString
data RequestResult
= RequestResult
{ target :: Target
, request :: Request
, response :: RequestResponse
}
deriving (Show)

prepareRequests :: [Target] -> [IO RequestResult]
prepareRequests = map (\target -> do

manager <- newManager $ setCommonManager defaultManagerSettings

initRequest <- parseRequest $ url target
let request = initRequest {
method = "GET",
requestHeaders = [
-- The data can be fetched even without setting a User-Agent
(hUserAgent, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36 Edg/93.0.961.38")
]
}

response <- httpLbs request manager

return (RequestResult target request response)
)

execRequest :: IO [(Target, Request, RequestResponse, String)]
execRequest = do
responses <- sequenceA $ prepareRequests targets
traverse (\RequestResult { target, request, response } -> do
putStrLn $ "The target url was:" ++ url target
putStrLn $ "The request info was:" ++ show request
putStrLn $ "The response status was: " ++ show (responseStatus response)

let body = UTF8.toString $ responseBody response

writeFile ("./packages/grappler/data/" ++ "TopGov" ++ ".txt") body

putStrLn "Done"

return (target, request, response, body)) responses

parseBody :: String -> [Tag String]
parseBody = parseTags

extractList :: [Tag String] -> [[Tag String]]
extractList =
map (
concat .
(\tags -> [
(take 2 . dropWhile (~/= ("<a>" :: String))) tags,
(take 1 . drop 1 . dropWhile (~/= TagOpen "span" [("class" :: String,"date")])) tags
]) .
takeWhile (~/= ("</li>" :: String))
) .
sections (~== ("<h4>" :: String)) .
takeWhile (~/= TagOpen "span" [("class" :: String, "public_more")]) .
dropWhile (~/= TagOpen "div" [("class" :: String, "news_box")] )

newtype Item
= Item (String, String, String)
extractData :: [Tag String] -> Item
extractData tags = Item (title, url, date)
where
url = fromAttrib "href" . head $ tags
title = fromTagText (tags !! 1)
date = fromTagText (tags !! 2)

instance Show Item where
show (Item (title, url , date)) = "(" ++ title ++ ", " ++ url ++ ", " ++ date ++ ")"

formatData :: Item -> Item
formatData (Item (title, url, date)) = Item (formatTitle title, formatUrl url, formatDate date)
where
formatTitle = trim
-- Some links lack the domain and need to be completed
formatUrl [] = []
formatUrl ('/':xs) = trim $ "http://www.gov.cn/" ++ xs
formatUrl u = trim u
formatDate = trim
trim = filter (not . isSpace)

grap :: IO ()
grap = forever $
forkIO (do
[(target, _, _, body)] <- execRequest
print target
let tags = map (formatData . extractData) . extractList $ parseBody body
print tags
writeFile "./packages/grappler/data/tags.tmp" (show tags))
>> threadDelay (60 * 10 * 1000000)
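As an illustration of the TagSoup pipeline above, running a hand-written fragment shaped like the gov.cn news list through `parseBody`, `extractList`, `extractData`, and `formatData` produces the `(title, url, date)` triples; the markup and values here are invented, and the helpers are assumed to be in scope (they are not exported from the module):

```haskell
-- Illustration only: a minimal fragment in the shape the parser expects.
demo :: IO ()
demo = print (map (formatData . extractData) (extractList (parseBody sample)))
  where
    sample =
      "<div class=\"news_box\"><ul><li><h4>\
      \<a href=\"/zhengce/content/2021-09/18/content_0001.htm\">Sample Policy Title</a>\
      \<span class=\"date\">2021-09-18</span></h4></li></ul></div>"
-- Expected output (trim strips all whitespace, so the spaces inside the title disappear,
-- and the relative href is completed with the gov.cn domain):
-- [(SamplePolicyTitle, http://www.gov.cn/zhengce/content/2021-09/18/content_0001.htm, 2021-09-18)]
```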
14 changes: 14 additions & 0 deletions packages/grappler/app/Main.hs
@@ -0,0 +1,14 @@
#!/usr/bin/env stack
{-stack
script
--resolver lts-18.6
--package utf8-string
--package http-client,http-conduit,http-types
--package tagsoup
-}

import qualified Gov.Top as Top

main :: IO ()
main = do
Top.grap