diff --git a/Makefile b/Makefile
index dee04fe..c110fd1 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-VERSION=0.2.0
+VERSION=0.2.1
 
 all: lint test
 
@@ -9,15 +9,14 @@ lint:
 	@go fmt ./...
 	@golint ./...
 
-test: build
+test: test-src test-cli
+test-src:
+	@go test ./...
+test-cli: build
 	@./htmltable2csv -v
 	@./htmltable2csv -source "./scraper/fixture/test1.html" -selector "table > tbody > tr" -csv data_file.csv
 	@./htmltable2csv -source "https://www.w3schools.com/html/html_tables.asp" -selector "#customers > tbody > tr" -csv data_url.csv
 
-test-all:
-	@go test ./...
-	@make test
-
 release:
 	git tag -a v${VERSION} -m "Version ${VERSION}"
 	git push origin v${VERSION}
diff --git a/README.md b/README.md
index a02cc6f..efb0416 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,8 @@ Flags:
         The table css selector
   -source string
         The filepath or website url
+  -trim
+        Trim the whitespace for each table column
   -v
         Print the version and exit
 Examples:
diff --git a/main.go b/main.go
index eada04f..f79752d 100644
--- a/main.go
+++ b/main.go
@@ -29,6 +29,7 @@ func main() {
 	flagVersion := flag.Bool("v", false, "Print the version and exit")
 	flagSource := flag.String("source", "", "The filepath or website url")
 	flagSelector := flag.String("selector", "", "The table css selector")
+	flagTrim := flag.Bool("trim", false, "Trim the whitespace for each table column")
 	flagCSV := flag.String("csv", "", "The csv filename. if empty, print csv to stdout")
 	flag.Usage = usage
 	flag.Parse()
@@ -52,6 +53,7 @@ func main() {
 	scraper := htmltable2csv.Scraper{}
 	scraper.Source = *flagSource
 	scraper.Selector = *flagSelector
+	scraper.Trim = *flagTrim
 	_, err = scraper.Scrape()
 	if err != nil {
 		fmt.Println(err)
diff --git a/package.json b/package.json
index 4d8fef8..9be9b51 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "htmltable2csv",
-  "version": "0.2.0",
+  "version": "0.2.1",
   "description": "htmltable2csv is a tool to parse a html table and store the data as csv. It can be written to a file or print out to stdout",
   "scripts": {
     "test": "make test"
diff --git a/scraper/fixture/test2.html b/scraper/fixture/test2.html
new file mode 100644
index 0000000..40bd247
--- /dev/null
+++ b/scraper/fixture/test2.html
@@ -0,0 +1,50 @@
+<html>
+<head>
+</head>
+<body>
+<table>
+  <thead>
+    <tr>
+      <th>key</th>
+      <th>value</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>
+        foo
+      </td>
+      <td>
+        1
+      </td>
+    </tr>
+    <tr>
+      <td>
+        bar
+      </td>
+      <td>
+        2
+
+
+      </td>
+    </tr>
+    <tr>
+      <td>
+        baz
+
+
+
+      </td>
+      <td>
+
+
+        3
+
+
+
+      </td>
+    </tr>
+  </tbody>
+</table>
+</body>
+</html>
diff --git a/scraper/scraper.go b/scraper/scraper.go
index 3d17525..a890745 100644
--- a/scraper/scraper.go
+++ b/scraper/scraper.go
@@ -7,6 +7,7 @@ import (
 	"net/http"
 	"net/url"
 	"os"
+	"strings"
 
 	"github.com/PuerkitoBio/goquery"
 )
@@ -16,6 +17,7 @@ type Scraper struct {
 	Source   string
 	Selector string
 	Data     [][]string
+	Trim     bool
 }
 
 // Scrape download and parse the table data
@@ -56,7 +58,11 @@ func (s *Scraper) Scrape() ([][]string, error) {
 	doc.Find(s.Selector).Each(func(i int, table *goquery.Selection) {
 		dataRow := make([]string, 0)
 		table.Find("td").Each(func(j int, td *goquery.Selection) {
-			dataRow = append(dataRow, td.Text())
+			text := td.Text()
+			if s.Trim {
+				text = strings.TrimSpace(text)
+			}
+			dataRow = append(dataRow, text)
 		})
 		data = append(data, dataRow)
 	})
diff --git a/scraper/scraper_test.go b/scraper/scraper_test.go
index 60e02e9..930415c 100644
--- a/scraper/scraper_test.go
+++ b/scraper/scraper_test.go
@@ -18,6 +18,18 @@ func TestScraper(t *testing.T) {
 		dataEqual(t, data)
 	})
 
+	t.Run("source file and trim", func(t *testing.T) {
+		scraper := Scraper{}
+		scraper.Source = "./fixture/test2.html"
+		scraper.Selector = "table > tbody > tr"
+		scraper.Trim = true
+		data, err := scraper.Scrape()
+		if err != nil {
+			t.Error(err)
+		}
+		dataEqual(t, data)
+	})
+
 	t.Run("source url", func(t *testing.T) {
 		// Start a local HTTP server
 		server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {