Store configuration in context
leozqi committed May 29, 2024
1 parent 0bee692 commit cb2ad58
Showing 2 changed files with 49 additions and 95 deletions.
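The heart of this commit is a context round trip: main stores the parsed *Config under the string key "config" with context.WithValue, and run recovers it with a type assertion. A minimal sketch of that pattern, using only the standard library — note the unexported key type, which go vet recommends over a bare string like "config" because string keys can collide across packages (the ctxKey type below is illustrative, not part of this commit):

package main

import (
	"context"
	"fmt"
)

// ctxKey is an unexported key type; a plain string key compiles but
// can collide with keys from other packages, and go vet flags it.
type ctxKey struct{}

type Config struct{ MaxWorkers int }

func main() {
	c := &Config{MaxWorkers: 4}
	ctx := context.WithValue(context.Background(), ctxKey{}, c)

	// The comma-ok assertion avoids a panic if the value is
	// missing or has an unexpected type.
	if got, ok := ctx.Value(ctxKey{}).(*Config); ok {
		fmt.Println(got.MaxWorkers) // prints 4
	}
}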
124 changes: 33 additions & 91 deletions cmd/main.go
@@ -23,7 +23,6 @@ import (
 	"io"
 	"log"
 	"os"
-	"sync"
 	"time"

 	_ "github.com/mattn/go-sqlite3"
@@ -32,29 +31,22 @@ import (
 	"gocrawl/internal/graph"
 )

-// https://ieftimov.com/posts/four-steps-daemonize-your-golang-programs/
-
-type config struct {
-	daemon      bool
-	userAgent   string
-	contentType string
-	url         string
-	maxWorkers  int
-	tick        time.Duration
+type Config struct {
+	UserAgent   string
+	ContentType string
+	Url         string
+	MaxWorkers  int
 }

-const defaultTick = 500 * time.Millisecond
-
-const defaultUserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36"
+const defaultUserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " +
+	"AppleWebKit/537.36 (KHTML, like Gecko) " +
+	"Chrome/104.0.5112.79 Safari/537.36"

-func (c *config) init(args []string) error {
+func (c *Config) Init(args []string) error {
 	flags := flag.NewFlagSet(args[0], flag.ExitOnError)

 	// -config
 	flags.String(flag.DefaultConfigFlagname, "", "Path to config file")

 	var (
-		daemon      = flags.Bool("daemon", false, "Worker instance")
 		userAgent   = flags.String("user_agent", defaultUserAgent, "User-Agent HTTP header value")
 		contentType = flags.String("content_type", "text/html", "Content-Type HTTP header value")
 		url         = flags.String("url", "", "URL to crawl if not daemon")
@@ -66,105 +58,55 @@ func (c *config) init(args []string) error {
 		return err
 	}

-	c.daemon = *daemon
-	c.userAgent = *userAgent
-	c.contentType = *contentType
-	c.url = *url
-	c.maxWorkers = *maxWorkers
-	c.tick = *tick
+	c.UserAgent = *userAgent
+	c.ContentType = *contentType
+	c.Url = *url
+	c.MaxWorkers = *maxWorkers

 	return nil
 }

 func main() {
-	ctx := context.Background()
-	ctx, cancel := context.WithCancel(ctx)
-
-	c := &config{}
-	c.init(os.Args)
+	c := &Config{}
+	c.Init(os.Args)
+
+	// Stash the parsed configuration in the context so downstream
+	// functions no longer need an explicit *Config parameter.
+	ctx := context.WithValue(context.Background(), "config", c)
+	ctx, cancel := context.WithCancel(ctx)
 	defer func() {
 		cancel()
 	}()

-	if err := run(ctx, c, os.Stdout); err != nil {
+	if err := run(ctx, os.Stdout); err != nil {
 		fmt.Fprintf(os.Stderr, "%s\n", err)
 		os.Exit(1)
 	}
 }

-func runCmd(ctx context.Context, db *sql.DB, c *config) {
-	if c.url != "" {
-		p := graph.NewPage()
-		p.Id = c.url
-		if err := graph.InsertPage(ctx, db, p); err != nil {
-			log.Printf("Error adding url %s to jobs list: %s\n", p.Id, err)
-		} else {
-			log.Printf("Added url %s to jobs list\n", c.url)
-		}
-	}
-}
-
-func run(ctx context.Context, c *config, out io.Writer) error {
-	log.SetOutput(out)
-
-	db, err := sql.Open("sqlite3", "./gocrawl.db")
+// initDatabase opens the SQLite file and creates the schema. It takes
+// a **sql.DB so it can populate the caller's handle in place; the
+// caller keeps ownership and is responsible for closing it.
+func initDatabase(db **sql.DB) error {
+	var err error
+	*db, err = sql.Open("sqlite3", "./gocrawl.db")
 	if err != nil {
 		return err
 	}
-	defer db.Close()

-	if err = graph.InitDbFile(db); err != nil {
+	if err = graph.InitDbFile(*db); err != nil {
 		return err
 	}
-
-	if !c.daemon {
-		runCmd(ctx, db, c)
-		return nil
-	}
+	return nil
+}

-	// We will spawn this many goroutines
-	var throttle = make(chan int, c.maxWorkers)
-	taskMgr := &sync.Mutex{}
-	for i := 0; i < c.maxWorkers; i++ {
-		throttle <- 1
-	}
-	log.Printf("Spawning %d workers", c.maxWorkers)
-
-	for {
-		select {
-		case <-ctx.Done():
-			return nil
-		case <-throttle:
-			log.Printf("Spawned worker")
-			go work(ctx, db, c, throttle, taskMgr)
-		}
-	}
-}
-
-func work(ctx context.Context, db *sql.DB, c *config, throttle chan int, taskMgr *sync.Mutex) {
-	for {
-		select {
-		case <-ctx.Done():
-			throttle <- 1
-			return
-		case <-time.Tick(c.tick):
-			p := graph.NewPage()
-			taskMgr.Lock()
-			if err := graph.GetJob(ctx, db, p); err != nil {
-				log.Printf("Could not get DB job: %s", err)
-			} else {
-				log.Printf("Crawling URL %s", p.Id)
-			}
-			taskMgr.Unlock()
-
-			crawler.Crawl(ctx, db, p)
-			graph.UpdatePage(ctx, db, p)
-
-			taskMgr.Lock()
-			if err := crawler.AddNewJobs(ctx, db, p); err != nil {
-				log.Printf("Could not add new jobs: %s", err)
-			}
-			taskMgr.Unlock()
-		}
-	}
-}
+func run(ctx context.Context, out io.Writer) error {
+	log.SetOutput(out)
+
+	var db *sql.DB
+	if err := initDatabase(&db); err != nil {
+		return err
+	}
+	defer db.Close()
+
+	// Buffered queue of pages waiting to be crawled.
+	var pageQueue = make(chan *graph.Page, 100)
+
+	// Recover the typed configuration stored in the context by main.
+	c := ctx.Value("config").(*Config)
+	log.Printf("Spawning %d workers", c.MaxWorkers)
+	for i := 0; i < c.MaxWorkers; i++ {
+		go crawler.Bot(ctx, db, pageQueue)
+	}
+
+	// Block until cancellation so the worker goroutines keep running.
+	<-ctx.Done()
+	return nil
+}
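The rewritten run also trades the old mutex-and-ticker work loop for a plain worker pool: a buffered channel of jobs drained by MaxWorkers goroutines until the context is cancelled. A self-contained sketch of that shape, with placeholder names rather than the project's own:

package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

func main() {
	// A short timeout stands in for the daemon's cancellation signal.
	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer cancel()

	jobs := make(chan string, 100) // buffered queue, like pageQueue
	jobs <- "https://example.com"

	var wg sync.WaitGroup
	for i := 0; i < 4; i++ { // c.MaxWorkers in the commit
		wg.Add(1)
		go func() {
			defer wg.Done()
			for {
				select {
				case <-ctx.Done():
					return // shut down with the rest of the program
				case url := <-jobs:
					fmt.Println("crawling", url)
				}
			}
		}()
	}
	wg.Wait()
}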
20 changes: 16 additions & 4 deletions internal/crawler/crawler.go
@@ -25,10 +25,10 @@ import (
 	"golang.org/x/net/html"
 	"hash/crc32"
 	"io/ioutil"
+	"log"
 	"net/http"
-	"strings"
 	"net/url"
-	"log"
 	"strings"
 )

 func DownloadPage(url string) ([]byte, error) {
@@ -101,7 +101,19 @@ func ParseDOMString(dom string, keywordTags *utils.Set) (*utils.Set, *utils.Set)
 	return unique, links
 }

-func Crawl(ctx context.Context, db *sql.DB, p *graph.Page) error {
+// Bot is a long-lived worker: it pulls pages off the queue and crawls
+// them until the context is cancelled.
+func Bot(ctx context.Context, db *sql.DB, pageQueue chan *graph.Page) {
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case p := <-pageQueue:
+			log.Printf("Crawling URL %s", p.Id)
+			Crawl(ctx, p)
+		}
+	}
+}
+
+func Crawl(ctx context.Context, p *graph.Page) error {
 	pageBytes, err := DownloadPage(p.Id)
 	if err != nil {
 		return err
@@ -144,7 +156,7 @@ func AddNewJobs(ctx context.Context, db *sql.DB, p *graph.Page) error {

 	job := graph.NewPage()
 	job.Id = link
-
+
 	err = graph.InsertPage(ctx, db, job)
 	if err != nil {
 		log.Printf("error for url %s; %s", link, err)
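One loose end worth noting: the removed work loop was the only code that pulled jobs out of the database (via graph.GetJob), and nothing in this commit sends into pageQueue yet. A hypothetical producer built from the helpers the old loop used might look like the sketch below — an assumption about a follow-up change, not part of this commit, and it presumes it lives in cmd/main.go next to the existing imports:

// Hypothetical producer: poll the jobs table on a ticker and feed the
// queue. GetJob and NewPage are the helpers the removed work loop used.
func produce(ctx context.Context, db *sql.DB, pageQueue chan *graph.Page) {
	tick := time.NewTicker(500 * time.Millisecond)
	defer tick.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-tick.C:
			p := graph.NewPage()
			if err := graph.GetJob(ctx, db, p); err != nil {
				log.Printf("Could not get DB job: %s", err)
				continue
			}
			pageQueue <- p
		}
	}
}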
