Skip to content

Commit

Permalink
Release v0.2.0 (#1)
Browse files Browse the repository at this point in the history
* moved default logger to private var
* added installation command
  • Loading branch information
nfx authored Sep 18, 2022
1 parent cc93695 commit 201b328
Show file tree
Hide file tree
Showing 9 changed files with 98 additions and 92 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ vendor:
go mod vendor

test:
go test ./... -coverprofile=coverage.out -timeout=10s
go test -coverpkg=./... -coverprofile=coverage.out -timeout=10s ./...

coverage: test
go tool cover -html=coverage.out
Expand Down
10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,19 @@
[![build](https://github.com/nfx/go-htmltable/workflows/build/badge.svg?branch=main)](https://github.com/nfx/go-htmltable/actions?query=workflow%3Abuild+branch%3Amain)


`htmltable` enables structured data extraction from HTML tables and URLs and requires almost no external dependencies.
`htmltable` enables structured data extraction from HTML tables and URLs and requires almost no external dependencies. Tested with Go 1.18.x and 1.19.x.

## Installation

```bash
go get github.com/nfx/go-htmltable
```

## Usage

You can retrieve a slice of `header`-annotated types using the `NewSlice*` constructors:

```go
import "github.com/nfx/go-htmltable"

type Ticker struct {
Symbol string `header:"Symbol"`
Security string `header:"Security"`
Expand Down
8 changes: 4 additions & 4 deletions example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ func ExampleNewSliceFromUrl() {
fmt.Println(out[0].Symbol)
fmt.Println(out[0].Security)

// Output:
// Output:
// MMM
// 3M
}
Expand All @@ -45,7 +45,7 @@ func ExampleNewFromString() {
return nil
})

// Output:
// Output:
// found 2 tables
// c:2 d:5
// c:4 d:6
Expand All @@ -56,7 +56,7 @@ func ExampleNewFromURL() {
_, err := page.FindWithColumns("invalid", "column", "names")
fmt.Println(err)

// Output:
// Output:
// cannot find table with columns: invalid, column, names
}

Expand All @@ -69,4 +69,4 @@ func ExampleLogger() {
// Output:
// [INFO] found table [columns [Symbol Security SEC filings GICSSector GICS Sub-Industry Headquarters Location Date first added CIK Founded] count 504]
// [INFO] found table [columns [Date Added Ticker Added Security Removed Ticker Removed Security Reason] count 308]
}
}
12 changes: 9 additions & 3 deletions log.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,16 @@ import (

// Logger is a very simplistic structured logger, that should
// be overridden by integrations.
var Logger = func(_ context.Context, msg string, fields ...any) {
var Logger func(_ context.Context, msg string, fields ...any)

func init() {
Logger = defaultLogger
}

var defaultLogger = func(_ context.Context, msg string, fields ...any) {
var sb strings.Builder
sb.WriteString(msg)
if len(fields) % 2 != 0 {
if len(fields)%2 != 0 {
panic(fmt.Errorf("number of logged fields is not even"))
}
for i := 0; i < len(fields); i += 2 {
Expand All @@ -22,4 +28,4 @@ var Logger = func(_ context.Context, msg string, fields ...any) {
sb.WriteString(fmt.Sprint(fields[i+1]))
}
log.Print(sb.String())
}
}
4 changes: 2 additions & 2 deletions log_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ func TestLoggerNoFields(t *testing.T) {
}

func TestLoggerWrongFields(t *testing.T) {
defer func(){
defer func() {
p := recover()
if p == nil {
t.Fatalf("there must be panic")
}
}()
Logger(context.Background(), "message", 1)
}
}
117 changes: 57 additions & 60 deletions page.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,66 +17,66 @@ var htmlParse = html.Parse

var maxPossibleHeaderRows = 5

type page struct {
tables []*tableData
StartHeaderAt int
ctx context.Context
rowSpans []int
colSpans []int
row []string
rows [][]string
// Page is the container for all tables that could be parsed.
type Page struct {
Tables []*Table

ctx context.Context
rowSpans []int
colSpans []int
row []string
rows [][]string
}

// New returns an instance of the page with possibly more than one table
func New(ctx context.Context, r io.Reader) (*page, error) {
p := &page{
ctx: ctx,
}
err := p.init(r)
return p, err
func New(ctx context.Context, r io.Reader) (*Page, error) {
p := &Page{ctx: ctx}
return p, p.init(r)
}

// NewFromString is same as New(ctx.Context, io.Reader), but from string
func NewFromString(r string) (*page, error) {
func NewFromString(r string) (*Page, error) {
return New(context.Background(), strings.NewReader(r))
}

// NewFromResponse is same as New(ctx.Context, io.Reader), but from http.Response.
//
// In case of failure, returns `ResponseError`, which can be inspected further.
func NewFromResponse(resp *http.Response) (*page, error) {
func NewFromResponse(resp *http.Response) (*Page, error) {
p, err := New(resp.Request.Context(), resp.Body)
if err != nil {
// wrap error with http response
err = &ResponseError{resp, err}
return nil, err
}
return p, err
return p, nil
}

// NewFromURL is same as New(ctx.Context, io.Reader), but from URL.
//
// In case of failure, returns `ResponseError`, which can be inspected further.
func NewFromURL(url string) (*page, error) {
func NewFromURL(url string) (*Page, error) {
resp, err := http.Get(url)
if err != nil {
return nil, err
}
if resp.Body != nil {
defer resp.Body.Close()
}
return NewFromResponse(resp)
}

// Len returns number of tables found on the page
func (p *page) Len() int {
return len(p.tables)
func (p *Page) Len() int {
return len(p.Tables)
}

// FindWithColumns performs fuzzy matching of tables by given header column names
func (p *page) FindWithColumns(columns ...string) (*tableData, error) {
func (p *Page) FindWithColumns(columns ...string) (*Table, error) {
// sentinel for "nothing found yet": a realistic page won't have this many tables
found := 0xfffffff
for idx, table := range p.tables {
for idx, table := range p.Tables {
matchedColumns := 0
for _, col := range columns {
for _, header := range table.header {
for _, header := range table.Header {
if col == header {
// perform fuzzy matching of table headers
matchedColumns++
Expand All @@ -86,35 +86,35 @@ func (p *page) FindWithColumns(columns ...string) (*tableData, error) {
if matchedColumns != len(columns) {
continue
}
if found < len(p.tables) {
if found < len(p.Tables) {
// and do a best-effort error message, that is cleaner than pandas.read_html
return nil, fmt.Errorf("more than one table matches columns `%s`: "+
"[%d] %s and [%d] %s", strings.Join(columns, ", "),
found, p.tables[found], idx, p.tables[idx])
found, p.Tables[found], idx, p.Tables[idx])
}
found = idx
}
if found > len(p.tables) {
if found > len(p.Tables) {
return nil, fmt.Errorf("cannot find table with columns: %s",
strings.Join(columns, ", "))
}
return p.tables[found], nil
return p.Tables[found], nil
}

// Each calls f for every row, passing the value of the table cell from the
// column specified in the first argument.
//
// Returns an error if table has no matching column name.
func (p *page) Each(a string, f func(a string) error) error {
func (p *Page) Each(a string, f func(a string) error) error {
table, err := p.FindWithColumns(a)
if err != nil {
return err
}
offsets := map[string]int{}
for idx, header := range table.header {
for idx, header := range table.Header {
offsets[header] = idx
}
for idx, row := range table.rows {
for idx, row := range table.Rows {
err = f(row[offsets[a]])
if err != nil {
return fmt.Errorf("row %d: %w", idx, err)
Expand All @@ -127,17 +127,17 @@ func (p *page) Each(a string, f func(a string) error) error {
// and call the func with those values for every row in the table.
//
// Returns an error if table has no matching column names.
func (p *page) Each2(a, b string, f func(a, b string) error) error {
func (p *Page) Each2(a, b string, f func(a, b string) error) error {
table, err := p.FindWithColumns(a, b)
if err != nil {
return err
}
offsets := map[string]int{}
for idx, header := range table.header {
for idx, header := range table.Header {
offsets[header] = idx
}
_1, _2 := offsets[a], offsets[b]
for idx, row := range table.rows {
for idx, row := range table.Rows {
err = f(row[_1], row[_2])
if err != nil {
return fmt.Errorf("row %d: %w", idx, err)
Expand All @@ -150,17 +150,17 @@ func (p *page) Each2(a, b string, f func(a, b string) error) error {
// and call the func with those values for every row in the table.
//
// Returns an error if table has no matching column names.
func (p *page) Each3(a, b, c string, f func(a, b, c string) error) error {
func (p *Page) Each3(a, b, c string, f func(a, b, c string) error) error {
table, err := p.FindWithColumns(a, b, c)
if err != nil {
return err
}
offsets := map[string]int{}
for idx, header := range table.header {
for idx, header := range table.Header {
offsets[header] = idx
}
_1, _2, _3 := offsets[a], offsets[b], offsets[c]
for idx, row := range table.rows {
for idx, row := range table.Rows {
err = f(row[_1], row[_2], row[_3])
if err != nil {
return fmt.Errorf("row %d: %w", idx, err)
Expand All @@ -169,7 +169,7 @@ func (p *page) Each3(a, b, c string, f func(a, b, c string) error) error {
return nil
}

func (p *page) init(r io.Reader) error {
func (p *Page) init(r io.Reader) error {
root, err := htmlParse(r)
if err != nil {
return err
Expand All @@ -179,7 +179,7 @@ func (p *page) init(r io.Reader) error {
return nil
}

func (p *page) parse(n *html.Node) {
func (p *Page) parse(n *html.Node) {
if n == nil {
return
}
Expand Down Expand Up @@ -214,7 +214,7 @@ func (p *page) parse(n *html.Node) {
}
}

func (p *page) intAttrOr(n *html.Node, attr string, default_ int) int {
func (p *Page) intAttrOr(n *html.Node, attr string, default_ int) int {
for _, a := range n.Attr {
if a.Key != attr {
continue
Expand All @@ -228,15 +228,15 @@ func (p *page) intAttrOr(n *html.Node, attr string, default_ int) int {
return default_
}

func (p *page) finishRow() {
func (p *Page) finishRow() {
if len(p.row) == 0 {
return
}
p.rows = append(p.rows, p.row[:])
p.row = []string{}
}

func (p *page) finishTable() {
func (p *Page) finishTable() {
p.finishRow()
if len(p.rows) == 0 {
return
Expand Down Expand Up @@ -274,16 +274,16 @@ func (p *page) finishTable() {
dataOffset += 1
}
Logger(p.ctx, "found table", "columns", header, "count", len(p.rows))
p.tables = append(p.tables, &tableData{
header: header,
rows: p.rows[dataOffset:],
p.Tables = append(p.Tables, &Table{
Header: header,
Rows: p.rows[dataOffset:],
})
p.rows = [][]string{}
p.colSpans = []int{}
p.rowSpans = []int{}
}

func (p *page) innerText(n *html.Node, sb *strings.Builder) {
func (p *Page) innerText(n *html.Node, sb *strings.Builder) {
if n.Type == html.TextNode {
sb.WriteString(strings.TrimSpace(n.Data))
return
Expand All @@ -296,20 +296,17 @@ func (p *page) innerText(n *html.Node, sb *strings.Builder) {
}
}

type ResponseError struct {
Response *http.Response
Inner error
}

func (re *ResponseError) Error() string {
return re.Inner.Error()
}
// Table is the low-level representation of raw header and rows.
//
// Every cell string value has its surrounding whitespace trimmed.
type Table struct {
// Header holds names of headers
Header []string

type tableData struct {
header []string
rows [][]string
// Rows holds slice of string slices
Rows [][]string
}

func (table *tableData) String() string {
return fmt.Sprintf("Table[%s] (%d rows)", strings.Join(table.header, ", "), len(table.rows))
func (table *Table) String() string {
return fmt.Sprintf("Table[%s] (%d rows)", strings.Join(table.Header, ", "), len(table.Rows))
}
Loading

0 comments on commit 201b328

Please sign in to comment.