-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.go
112 lines (92 loc) · 2.71 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
// This file is part of GoCrawl.
//
// GoCrawl is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the
// Free Software Foundation, either version 3 of the License, or (at your
// option) any later version.
//
// GoCrawl is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with GoCrawl. If not, see <https://www.gnu.org/licenses/>.
package main
import (
"context"
"database/sql"
"fmt"
"github.com/namsral/flag"
"io"
"log"
"os"
"time"
_ "github.com/mattn/go-sqlite3"
"gocrawl/internal/crawler"
"gocrawl/internal/graph"
)
// Config holds the crawler settings collected from the command line by Init.
type Config struct {
	// UserAgent is the value of the user_agent flag ("User-Agent HTTP header value").
	UserAgent string
	// ContentType is the value of the content_type flag ("Content-Type HTTP header value").
	ContentType string
	// Url is the value of the url flag ("URL to crawl if not daemon").
	Url string
	// MaxWorkers is the value of the max_workers flag ("maximum number of goroutines").
	MaxWorkers int
	// Tick is the value of the tick flag ("Polling of worker goroutines before crawling").
	Tick time.Duration
}

// defaultUserAgent is the default for the user_agent flag.
const defaultUserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " +
	"AppleWebKit/537.36 (KHTML, like Gecko) " +
	"Chrome/104.0.5112.79 Safari/537.36"

// Init parses args into c.
//
// args must have the shape of os.Args: args[0] names the flag set and
// args[1:] are the flags to parse. An empty args slice yields an error
// instead of an index-out-of-range panic.
//
// The flag set is created with flag.ExitOnError, so the flag package deals
// with malformed input itself; the error from Parse is still returned to the
// caller if it is ever non-nil.
func (c *Config) Init(args []string) error {
	if len(args) == 0 {
		return fmt.Errorf("config: empty argument list")
	}

	flags := flag.NewFlagSet(args[0], flag.ExitOnError)
	// Registered only so the flag exists; its value is not read here.
	// NOTE(review): presumably consumed by the flag library itself to locate
	// a config file — confirm against the library documentation.
	flags.String(flag.DefaultConfigFlagname, "", "Path to config file")

	var (
		userAgent   = flags.String("user_agent", defaultUserAgent, "User-Agent HTTP header value")
		contentType = flags.String("content_type", "text/html", "Content-Type HTTP header value")
		url         = flags.String("url", "", "URL to crawl if not daemon")
		maxWorkers  = flags.Int("max_workers", 8, "maximum number of goroutines")
		// NOTE(review): defaultTick is not declared in this file — confirm it
		// is defined elsewhere in package main.
		tick = flags.Duration("tick", defaultTick, "Polling of worker goroutines before crawling")
	)

	if err := flags.Parse(args[1:]); err != nil {
		return err
	}

	c.UserAgent = *userAgent
	c.ContentType = *contentType
	c.Url = *url
	c.MaxWorkers = *maxWorkers
	// Previously the parsed tick was dropped (and the unused variable did not
	// compile); keep it alongside the other settings.
	c.Tick = *tick
	return nil
}
// main parses the command line into a Config, stores it in a cancellable
// context, and hands control to run with os.Stdout as the log destination.
// Any error from configuration parsing or from run is printed to os.Stderr
// and the process exits with status 1.
func main() {
	c := &Config{}
	// Init's error used to be discarded; report it like any other startup failure.
	if err := c.Init(os.Args); err != nil {
		fmt.Fprintf(os.Stderr, "%s\n", err)
		os.Exit(1)
	}

	// The plain string key "config" is kept as-is because run looks the
	// Config up under exactly that key.
	ctx := context.WithValue(context.Background(), "config", c)
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	if err := run(ctx, os.Stdout); err != nil {
		fmt.Fprintf(os.Stderr, "%s\n", err)
		// os.Exit skips deferred calls, so cancel explicitly first.
		cancel()
		os.Exit(1)
	}
}
// initDatabase opens the sqlite3 database "./gocrawl.db", passes the handle
// to graph.InitDbFile, and on success stores the handle in *db.
//
// db is an out-parameter: the caller passes the address of its own *sql.DB
// variable (as run does with &db). Previously the parameter was a plain
// *sql.DB that was only reassigned locally, so the caller never received the
// opened handle.
//
// On any error *db is left untouched and no handle stays open.
func initDatabase(db **sql.DB) error {
	if db == nil {
		return fmt.Errorf("initDatabase: nil destination for database handle")
	}

	conn, err := sql.Open("sqlite3", "./gocrawl.db")
	if err != nil {
		return fmt.Errorf("opening database: %w", err)
	}

	if err := graph.InitDbFile(conn); err != nil {
		// Best-effort cleanup; the initialization error is the one worth reporting.
		_ = conn.Close()
		return fmt.Errorf("initializing database: %w", err)
	}

	*db = conn
	return nil
}
func run(ctx context.Context, out io.Writer) error {
log.SetOutput(out)
var db *sql.DB = nil
if err := initDatabase(&db); err != nil {
return err
}
defer db.Close()
var pageQueue = make(chan *Page, 100)
log.Printf("Spawning %d workers", ctx.Value("config").MaxWorkers)
for i := 0; i < ctx.Value("config").MaxWorkers; i++ {
go crawler.Bot(ctx, db, pageQueue)
}
}