Skip to content

Commit

Permalink
Refactored/simplified code, fixed #9
Browse files Browse the repository at this point in the history
Removed most of the code, replaced it with
github.com/mrusme/journalist/crawler
  • Loading branch information
mrusme committed Aug 30, 2022
1 parent 71c16a2 commit 43d987f
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 773 deletions.
105 changes: 27 additions & 78 deletions cmd/root.go
Original file line number Diff line number Diff line change
@@ -1,29 +1,24 @@
package cmd

import (
"bufio"
"fmt"
"image/color"
"io"
"net/http"
"net/http/cookiejar"
"net/url"
"os"
"regexp"
"strconv"

"github.com/charmbracelet/glamour"
"github.com/eliukblau/pixterm/pkg/ansimage"
"github.com/go-shiori/go-readability"
"golang.org/x/crypto/ssh/terminal"
"golang.org/x/net/publicsuffix"

"golang.org/x/crypto/ssh/terminal"
md "github.com/JohannesKaufmann/html-to-markdown"
// scraper "github.com/cardigann/go-cloudflare-scraper"
"github.com/spf13/cobra"
scraper "github.com/tinoquang/go-cloudflare-scraper"

"github.com/mrusme/journalist/crawler"
"go.uber.org/zap"
)

// Command-line options shared across the package.
var (
	// verbose selects zap's development logger instead of the production
	// logger when the root command runs.
	verbose bool
	// noImages and noPretty are output toggles.
	// NOTE(review): their flag bindings and uses are outside the visible
	// part of this file; confirm exact semantics there.
	noImages bool
	noPretty bool
	// userAgent is the custom user agent string sent as the User-Agent
	// header on outgoing HTTP requests.
	userAgent string
)
Expand All @@ -38,74 +33,16 @@ var mdImgRegex =
// mdImgPlaceholderRegex matches placeholders of the form "$$$<digits>$" and
// captures the (possibly empty) run of digits as its first submatch.
var mdImgPlaceholderRegex = regexp.MustCompile(`(?m)\$\$\$([0-9]*)\$`)

func MakeReadable(rawUrl *string) (string, string, error) {
var reader io.ReadCloser
var urlUrl *url.URL
var err error

if *rawUrl == "-" {
reader, err = getReaderFromStdin()
} else {
urlUrl, err = url.Parse(*rawUrl)
if err != nil {
return "", "", err
}

switch(urlUrl.Scheme) {
case "http", "https":
reader, err = getReaderFromHTTP(rawUrl)
default:
reader, err = getReaderFromFile(rawUrl)
}
}
defer reader.Close()
func MakeReadable(rawUrl *string, logger *zap.Logger) (string, string, error) {
var crwlr *crawler.Crawler = crawler.New(logger)

article, err := readability.FromReader(reader, urlUrl)
crwlr.SetLocation(*rawUrl)
article, err := crwlr.GetReadable()
if err != nil {
return "", "", err
}

return article.Title, article.Content, nil
}

// getReaderFromHTTP issues an HTTP GET for *rawUrl and returns the response
// body as an io.ReadCloser.
//
// The request is sent through a client that uses a cookie jar backed by the
// public suffix list and the go-cloudflare-scraper transport wrapped around
// http.DefaultTransport. It carries the package-level userAgent together with
// fixed Accept, Accept-Language and DNT headers.
//
// The response status code is not inspected. The returned body is left open
// on purpose: the caller owns it and must close it.
//
// An error is returned if the cookie jar, the scraper transport or the
// request cannot be created, or if the request itself fails.
func getReaderFromHTTP(rawUrl *string) (io.ReadCloser, error) {
	jar, err := cookiejar.New(&cookiejar.Options{PublicSuffixList: publicsuffix.List})
	if err != nil {
		return nil, err
	}

	// Named "transport" so the local does not shadow the imported scraper
	// package. The error used to be silently overwritten by the NewRequest
	// call below; it is now checked before the transport is used.
	transport, err := scraper.NewTransport(http.DefaultTransport)
	if err != nil {
		return nil, err
	}

	client := &http.Client{
		Jar:       jar,
		Transport: transport,
	}

	req, err := http.NewRequest("GET", *rawUrl, nil)
	if err != nil {
		return nil, err
	}

	req.Header.Set("User-Agent",
		userAgent)
	req.Header.Set("Accept",
		"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,"+
			"image/webp,*/*;q=0.8")
	req.Header.Set("Accept-Language",
		"en-US,en;q=0.5")
	req.Header.Set("DNT",
		"1")

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}

	// Do not close resp.Body here: it is handed to the caller, which is
	// responsible for closing it once the content has been read.
	return resp.Body, nil
}

func getReaderFromFile(rawUrl *string) (io.ReadCloser, error) {
return os.Open(*rawUrl)
return article.Title, article.ContentHtml, nil
}

func HTMLtoMarkdown(html *string) (string, error) {
Expand All @@ -119,10 +56,6 @@ func HTMLtoMarkdown(html *string) (string, error) {
return markdown, nil
}

// getReaderFromStdin returns standard input wrapped in a buffered reader and
// exposed as an io.ReadCloser whose Close is a no-op. The returned error is
// always nil.
func getReaderFromStdin() (io.ReadCloser, error) {
	stdin := bufio.NewReader(os.Stdin)
	return io.NopCloser(stdin), nil
}

func RenderImg(md string) (string, []InlineImage, error) {
var images []InlineImage

Expand Down Expand Up @@ -209,9 +142,18 @@ var rootCmd = &cobra.Command{
"pages on the CLI.",
Args: cobra.MinimumNArgs(1),
Run: func(cmd *cobra.Command, args []string) {
var logger *zap.Logger

if verbose == true {
logger, _ = zap.NewDevelopment()
} else {
logger, _ = zap.NewProduction()
}
defer logger.Sync()

rawUrl := args[0]

title, content, err := MakeReadable(&rawUrl)
title, content, err := MakeReadable(&rawUrl, logger)
if err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
Expand Down Expand Up @@ -263,6 +205,13 @@ func Execute() {
"Googlebot/2.1; +http://www.google.com/bot.html)",
"set custom user agent string",
)
rootCmd.Flags().BoolVarP(
&verbose,
"verbose",
"v",
false,
"verbose output",
)

if err := rootCmd.Execute(); err != nil {
fmt.Fprintln(os.Stderr, err)
Expand Down
25 changes: 17 additions & 8 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,10 @@ require (
github.com/JohannesKaufmann/html-to-markdown v1.3.5
github.com/charmbracelet/glamour v0.5.0
github.com/eliukblau/pixterm v1.3.1
github.com/go-shiori/go-readability v0.0.0-20220215145315-dd6828d2f09b
github.com/mrusme/journalist v1.0.1-0.20220830190010-0f512be110d3
github.com/spf13/cobra v1.5.0
github.com/tinoquang/go-cloudflare-scraper v0.0.0-20200802120520-9158f3ffb9bf
golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa
golang.org/x/net v0.0.0-20220728211354-c7608f3a8462
go.uber.org/zap v1.23.0
golang.org/x/crypto v0.0.0-20220829220503-c86fa9a7ed90
)

require (
Expand All @@ -21,24 +20,34 @@ require (
github.com/disintegration/imaging v1.6.2 // indirect
github.com/dlclark/regexp2 v1.7.0 // indirect
github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65 // indirect
github.com/go-shiori/go-readability v0.0.0-20220215145315-dd6828d2f09b // indirect
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
github.com/gorilla/css v1.0.0 // indirect
github.com/inconshreveable/mousetrap v1.0.0 // indirect
github.com/inconshreveable/mousetrap v1.0.1 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
github.com/mattn/go-isatty v0.0.14 // indirect
github.com/mattn/go-isatty v0.0.16 // indirect
github.com/mattn/go-runewidth v0.0.13 // indirect
github.com/microcosm-cc/bluemonday v1.0.19 // indirect
github.com/mmcdole/gofeed v1.1.3 // indirect
github.com/mmcdole/goxpp v0.0.0-20200921145534-2f3784f67354 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/muesli/reflow v0.3.0 // indirect
github.com/muesli/termenv v0.12.0 // indirect
github.com/olekukonko/tablewriter v0.0.5 // indirect
github.com/rivo/uniseg v0.3.1 // indirect
github.com/rivo/uniseg v0.3.4 // indirect
github.com/robertkrimen/otto v0.0.0-20211024170158-b87d35c0b86f // indirect
github.com/sirupsen/logrus v1.9.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/tinoquang/go-cloudflare-scraper v0.0.0-20200802120520-9158f3ffb9bf // indirect
github.com/yuin/goldmark v1.4.13 // indirect
github.com/yuin/goldmark-emoji v1.0.1 // indirect
go.uber.org/atomic v1.10.0 // indirect
go.uber.org/multierr v1.8.0 // indirect
golang.org/x/image v0.0.0-20220722155232-062f8c9fd539 // indirect
golang.org/x/sys v0.0.0-20220731174439-a90be440212d // indirect
golang.org/x/net v0.0.0-20220826154423-83b083e8dc8b // indirect
golang.org/x/sys v0.0.0-20220829200755-d48e67d00261 // indirect
golang.org/x/term v0.0.0-20220722155259-a9ba230a4035 // indirect
golang.org/x/text v0.3.7 // indirect
gopkg.in/sourcemap.v1 v1.0.5 // indirect
Expand Down
Loading

0 comments on commit 43d987f

Please sign in to comment.