From d362fae970c09abc8c0c97869f891857ed84885a Mon Sep 17 00:00:00 2001 From: Paul Date: Sat, 21 Aug 2021 11:27:33 +0200 Subject: [PATCH 1/9] Add basic toc generation for pandoc (WIP) Pandoc template missing right now --- markup/markup_config/config.go | 3 + markup/pandoc/convert.go | 128 +++++++++++++++++++++++++- markup/pandoc/pandoc_config/config.go | 27 ++++++ 3 files changed, 155 insertions(+), 3 deletions(-) create mode 100644 markup/pandoc/pandoc_config/config.go diff --git a/markup/markup_config/config.go b/markup/markup_config/config.go index e254ba7a03e..dfd0280b0aa 100644 --- a/markup/markup_config/config.go +++ b/markup/markup_config/config.go @@ -20,6 +20,7 @@ import ( "github.com/gohugoio/hugo/markup/asciidocext/asciidocext_config" "github.com/gohugoio/hugo/markup/goldmark/goldmark_config" "github.com/gohugoio/hugo/markup/highlight" + "github.com/gohugoio/hugo/markup/pandoc/pandoc_config" "github.com/gohugoio/hugo/markup/tableofcontents" "github.com/gohugoio/hugo/parser" "github.com/mitchellh/mapstructure" @@ -37,6 +38,7 @@ type Config struct { // Content renderers Goldmark goldmark_config.Config AsciidocExt asciidocext_config.Config + Pandoc pandoc_config.Config } func Decode(cfg config.Provider) (conf Config, err error) { @@ -84,6 +86,7 @@ var Default = Config{ Goldmark: goldmark_config.Default, AsciidocExt: asciidocext_config.Default, + Pandoc: pandoc_config.Default, } func init() { diff --git a/markup/pandoc/convert.go b/markup/pandoc/convert.go index ae90cf41770..41adedf5023 100644 --- a/markup/pandoc/convert.go +++ b/markup/pandoc/convert.go @@ -15,12 +15,16 @@ package pandoc import ( + "bytes" + "github.com/gohugoio/hugo/common/hexec" "github.com/gohugoio/hugo/htesting" "github.com/gohugoio/hugo/identity" "github.com/gohugoio/hugo/markup/internal" + "golang.org/x/net/html" "github.com/gohugoio/hugo/markup/converter" + "github.com/gohugoio/hugo/markup/tableofcontents" ) // Provider is the package entry point. 
@@ -38,17 +42,29 @@ func (p provider) New(cfg converter.ProviderConfig) (converter.Provider, error) }), nil } +type pandocResult struct { + converter.Result + toc tableofcontents.Root +} + +func (r pandocResult) TableOfContents() tableofcontents.Root { + return r.toc +} + type pandocConverter struct { ctx converter.DocumentContext cfg converter.ProviderConfig } func (c *pandocConverter) Convert(ctx converter.RenderContext) (converter.Result, error) { - b, err := c.getPandocContent(ctx.Src, c.ctx) + content, toc, err := c.extractTOC(c.getPandocContent(ctx.Src, c.ctx)) if err != nil { return nil, err } - return converter.Bytes(b), nil + return pandocResult{ + Result: converter.Bytes(content), + toc: toc, + }, nil } func (c *pandocConverter) Supports(feature identity.Identity) bool { @@ -64,7 +80,7 @@ func (c *pandocConverter) getPandocContent(src []byte, ctx converter.DocumentCon " Leaving pandoc content unrendered.") return src, nil } - args := []string{"--mathjax"} + args := []string{"--mathjax", "--toc", "--template", "toc", "-s", "--quiet"} return internal.ExternallyRenderContent(c.cfg, ctx, src, binaryName, args) } @@ -77,6 +93,112 @@ func getPandocBinaryName() string { return "" } +// extractTOC extracts the toc from the given src html. 
+// It returns the html without the TOC, and the TOC data +func (a *pandocConverter) extractTOC(src []byte) ([]byte, tableofcontents.Root, error) { + var buf bytes.Buffer + buf.Write(src) + node, err := html.Parse(&buf) + if err != nil { + return nil, tableofcontents.Root{}, err + } + var ( + f func(*html.Node) bool + toc tableofcontents.Root + toVisit []*html.Node + ) + f = func(n *html.Node) bool { + if n.Type == html.ElementNode && n.Data == "nav" && attr(n, "id") == "TOC" { + toc = parseTOC(n) + if !a.cfg.MarkupConfig.Pandoc.PreserveTOC { + n.Parent.RemoveChild(n) + } + return true + } + if n.FirstChild != nil { + toVisit = append(toVisit, n.FirstChild) + } + if n.NextSibling != nil && f(n.NextSibling) { + return true + } + for len(toVisit) > 0 { + nv := toVisit[0] + toVisit = toVisit[1:] + if f(nv) { + return true + } + } + return false + } + f(node) + if err != nil { + return nil, tableofcontents.Root{}, err + } + buf.Reset() + err = html.Render(&buf, node) + if err != nil { + return nil, tableofcontents.Root{}, err + } + // ltrim and rtrim which are added by html.Render + res := buf.Bytes()[25:] + res = res[:len(res)-14] + return res, toc, nil +} + +// parseTOC returns a TOC root from the given toc Node +func parseTOC(doc *html.Node) tableofcontents.Root { + var ( + toc tableofcontents.Root + f func(*html.Node, int, int) + ) + f = func(n *html.Node, row, level int) { + if n.Type == html.ElementNode { + switch n.Data { + case "ul": + if level == 0 { + row++ + } + level++ + f(n.FirstChild, row, level) + case "li": + for c := n.FirstChild; c != nil; c = c.NextSibling { + if c.Type != html.ElementNode || c.Data != "a" { + continue + } + href := attr(c, "href")[1:] + toc.AddAt(tableofcontents.Heading{ + Text: nodeContent(c), + ID: href, + }, row, level) + } + f(n.FirstChild, row, level) + } + } + if n.NextSibling != nil { + f(n.NextSibling, row, level) + } + } + f(doc.FirstChild, -1, 0) + return toc +} + +func attr(node *html.Node, key string) string { + for _, a 
:= range node.Attr { + if a.Key == key { + return a.Val + } + } + return "" +} + +func nodeContent(node *html.Node) string { + var buf bytes.Buffer + for c := node.FirstChild; c != nil; c = c.NextSibling { + html.Render(&buf, c) + } + return buf.String() +} + // Supports returns whether Pandoc is installed on this computer. func Supports() bool { hasBin := getPandocBinaryName() != "" diff --git a/markup/pandoc/pandoc_config/config.go b/markup/pandoc/pandoc_config/config.go new file mode 100644 index 00000000000..03b0d7db094 --- /dev/null +++ b/markup/pandoc/pandoc_config/config.go @@ -0,0 +1,27 @@ +// Copyright 2020 The Hugo Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pandoc_config holds pandoc related configuration. +package pandoc_config + +var ( + // Default holds Hugo's default pandoc configuration. + Default = Config{ + PreserveTOC: false, + } +) + +// Config configures pandoc. 
+type Config struct { + PreserveTOC bool +} From 6de4f5cde5e7bb8dcdea25effc076175069164bc Mon Sep 17 00:00:00 2001 From: Paul Date: Sat, 21 Aug 2021 11:54:52 +0200 Subject: [PATCH 2/9] Use pandoc's default standalone template Extract body from template first --- markup/pandoc/convert.go | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/markup/pandoc/convert.go b/markup/pandoc/convert.go index 41adedf5023..6f40cb54044 100644 --- a/markup/pandoc/convert.go +++ b/markup/pandoc/convert.go @@ -80,7 +80,7 @@ func (c *pandocConverter) getPandocContent(src []byte, ctx converter.DocumentCon " Leaving pandoc content unrendered.") return src, nil } - args := []string{"--mathjax", "--toc", "--template", "toc", "-s", "--quiet"} + args := []string{"--mathjax", "--toc", "-s", "--quiet"} return internal.ExternallyRenderContent(c.cfg, ctx, src, binaryName, args) } @@ -102,11 +102,38 @@ func (a *pandocConverter) extractTOC(src []byte) ([]byte, tableofcontents.Root, if err != nil { return nil, tableofcontents.Root{}, err } + var ( f func(*html.Node) bool + body *html.Node toc tableofcontents.Root toVisit []*html.Node ) + + f = func(n *html.Node) bool { + if n.Type == html.ElementNode && n.Data == "body" { + body = n + return true + } + if n.FirstChild != nil { + toVisit = append(toVisit, n.FirstChild) + } + if n.NextSibling != nil && f(n.NextSibling) { + return true + } + for len(toVisit) > 0 { + nv := toVisit[0] + toVisit = toVisit[1:] + if f(nv) { + return true + } + } + return false + } + if !f(node) { + return nil, tableofcontents.Root{}, err + } + f = func(n *html.Node) bool { if n.Type == html.ElementNode && n.Data == "nav" && attr(n, "id") == "TOC" { toc = parseTOC(n) @@ -130,12 +157,12 @@ func (a *pandocConverter) extractTOC(src []byte) ([]byte, tableofcontents.Root, } return false } - f(node) + f(body) if err != nil { return nil, tableofcontents.Root{}, err } buf.Reset() - err = html.Render(&buf, node) + err = 
html.Render(&buf, body) if err != nil { return nil, tableofcontents.Root{}, err } From 769439cc5b456392669e3db8742822ea9d42dca1 Mon Sep 17 00:00:00 2001 From: Paul Date: Sat, 21 Aug 2021 12:01:03 +0200 Subject: [PATCH 3/9] Add dummy title Else pandoc complains about missing title --- markup/pandoc/convert.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/markup/pandoc/convert.go b/markup/pandoc/convert.go index 6f40cb54044..974bf643a36 100644 --- a/markup/pandoc/convert.go +++ b/markup/pandoc/convert.go @@ -80,7 +80,7 @@ func (c *pandocConverter) getPandocContent(src []byte, ctx converter.DocumentCon " Leaving pandoc content unrendered.") return src, nil } - args := []string{"--mathjax", "--toc", "-s", "--quiet"} + args := []string{"--mathjax", "--toc", "-s", "--metadata", "title=dummy"} return internal.ExternallyRenderContent(c.cfg, ctx, src, binaryName, args) } From f1b1e5f41b168d1bf57661a94ba8f9132100d2b4 Mon Sep 17 00:00:00 2001 From: Paul Date: Sat, 21 Aug 2021 16:06:47 +0200 Subject: [PATCH 4/9] remove generated title, update trim positions --- markup/pandoc/convert.go | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/markup/pandoc/convert.go b/markup/pandoc/convert.go index 974bf643a36..558e7e15ef3 100644 --- a/markup/pandoc/convert.go +++ b/markup/pandoc/convert.go @@ -110,6 +110,7 @@ func (a *pandocConverter) extractTOC(src []byte) ([]byte, tableofcontents.Root, toVisit []*html.Node ) + // find body f = func(n *html.Node) bool { if n.Type == html.ElementNode && n.Data == "body" { body = n @@ -134,6 +135,30 @@ func (a *pandocConverter) extractTOC(src []byte) ([]byte, tableofcontents.Root, return nil, tableofcontents.Root{}, err } + // remove by pandoc generated title + f = func(n *html.Node) bool { + if n.Type == html.ElementNode && n.Data == "header" && attr(n, "id") == "title-block-header" { + n.Parent.RemoveChild(n) + return true + } + if n.FirstChild != nil { + toVisit = append(toVisit, 
n.FirstChild) + } + if n.NextSibling != nil && f(n.NextSibling) { + return true + } + for len(toVisit) > 0 { + nv := toVisit[0] + toVisit = toVisit[1:] + if f(nv) { + return true + } + } + return false + } + f(body) + + // find toc f = func(n *html.Node) bool { if n.Type == html.ElementNode && n.Data == "nav" && attr(n, "id") == "TOC" { toc = parseTOC(n) @@ -167,8 +192,8 @@ func (a *pandocConverter) extractTOC(src []byte) ([]byte, tableofcontents.Root, return nil, tableofcontents.Root{}, err } // ltrim and rtrim which are added by html.Render - res := buf.Bytes()[25:] - res = res[:len(res)-14] + res := buf.Bytes()[6:] + res = res[:len(res)-7] return res, toc, nil } From aaed290b02ec7cf8d52923365444482ad1fe48a2 Mon Sep 17 00:00:00 2001 From: Paul Date: Sun, 19 Jun 2022 11:40:53 +0200 Subject: [PATCH 5/9] fix compile error from rebase and improve error handling --- markup/pandoc/convert.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/markup/pandoc/convert.go b/markup/pandoc/convert.go index 558e7e15ef3..ed58b28c583 100644 --- a/markup/pandoc/convert.go +++ b/markup/pandoc/convert.go @@ -57,7 +57,11 @@ type pandocConverter struct { } func (c *pandocConverter) Convert(ctx converter.RenderContext) (converter.Result, error) { - content, toc, err := c.extractTOC(c.getPandocContent(ctx.Src, c.ctx)) + contentWithToc, err := c.getPandocContent(ctx.Src, c.ctx) + if err != nil { + return nil, err + } + content, toc, err := c.extractTOC(contentWithToc) if err != nil { return nil, err } From 35461c167fe26900553e981a725d54d6e82389ba Mon Sep 17 00:00:00 2001 From: Paul Date: Thu, 6 Oct 2022 19:13:28 +0200 Subject: [PATCH 6/9] fix parsing of pandoc result --- markup/pandoc/convert.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/markup/pandoc/convert.go b/markup/pandoc/convert.go index ed58b28c583..fca7f098c06 100644 --- a/markup/pandoc/convert.go +++ b/markup/pandoc/convert.go @@ -195,9 +195,9 @@ func (a *pandocConverter) 
extractTOC(src []byte) ([]byte, tableofcontents.Root, if err != nil { return nil, tableofcontents.Root{}, err } - // ltrim and rtrim which are added by html.Render - res := buf.Bytes()[6:] - res = res[:len(res)-7] + // ltrim \n\n and rtrim \n\n which are added by html.Render + res := buf.Bytes()[8:] + res = res[:len(res)-9] return res, toc, nil } From c38633a0c41fe23a5216220f81bc30982e5ba78e Mon Sep 17 00:00:00 2001 From: Paul Date: Thu, 6 Oct 2022 19:13:37 +0200 Subject: [PATCH 7/9] add basic integrations tests to pandoc --- markup/pandoc/integration_test.go | 62 +++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 markup/pandoc/integration_test.go diff --git a/markup/pandoc/integration_test.go b/markup/pandoc/integration_test.go new file mode 100644 index 00000000000..477ba43ba1a --- /dev/null +++ b/markup/pandoc/integration_test.go @@ -0,0 +1,62 @@ +// Copyright 2021 The Hugo Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pandoc_test + +import ( + "testing" + + "github.com/gohugoio/hugo/hugolib" +) + +func TestBasicConversion(t *testing.T) { + t.Parallel() + + files := ` +-- config.toml -- +-- content/p1.md -- +testContent +-- layouts/_default/single.html -- +{{ .Content }} +` + b := hugolib.NewIntegrationTestBuilder( + hugolib.IntegrationTestConfig{ + T: t, + TxtarString: files, + NeedsOsFS: true, + }, + ).Build() + + b.AssertFileContent("public/p1/index.html", `

testContent

`) +} + +func TestConversionWithHeader(t *testing.T) { + t.Parallel() + + files := ` +-- config.toml -- +-- content/p1.md -- +# testContent +-- layouts/_default/single.html -- +{{ .Content }} +` + b := hugolib.NewIntegrationTestBuilder( + hugolib.IntegrationTestConfig{ + T: t, + TxtarString: files, + NeedsOsFS: true, + }, + ).Build() + + b.AssertFileContent("public/p1/index.html", `

testContent

`) +} From addc311e5689e645d92d536e43e3089391294bed Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 9 Oct 2023 15:52:25 +0200 Subject: [PATCH 8/9] fix convert.go for changes made in other parts --- markup/pandoc/convert.go | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/markup/pandoc/convert.go b/markup/pandoc/convert.go index bb44afb399f..ff6fe001d2c 100644 --- a/markup/pandoc/convert.go +++ b/markup/pandoc/convert.go @@ -43,11 +43,11 @@ func (p provider) New(cfg converter.ProviderConfig) (converter.Provider, error) } type pandocResult struct { - converter.Result - toc tableofcontents.Root + converter.ResultRender + toc *tableofcontents.Fragments } -func (r pandocResult) TableOfContents() tableofcontents.Root { +func (r pandocResult) TableOfContents() *tableofcontents.Fragments { return r.toc } @@ -66,8 +66,8 @@ func (c *pandocConverter) Convert(ctx converter.RenderContext) (converter.Result return nil, err } return pandocResult{ - Result: converter.Bytes(content), - toc: toc, + ResultRender: converter.Bytes(content), + toc: toc, }, nil } @@ -99,18 +99,18 @@ func getPandocBinaryName() string { // extractTOC extracts the toc from the given src html. 
// It returns the html without the TOC, and the TOC data -func (a *pandocConverter) extractTOC(src []byte) ([]byte, tableofcontents.Root, error) { +func (a *pandocConverter) extractTOC(src []byte) ([]byte, *tableofcontents.Fragments, error) { var buf bytes.Buffer buf.Write(src) node, err := html.Parse(&buf) if err != nil { - return nil, tableofcontents.Root{}, err + return nil, nil, err } var ( f func(*html.Node) bool body *html.Node - toc tableofcontents.Root + toc *tableofcontents.Fragments toVisit []*html.Node ) @@ -136,7 +136,7 @@ func (a *pandocConverter) extractTOC(src []byte) ([]byte, tableofcontents.Root, return false } if !f(node) { - return nil, tableofcontents.Root{}, err + return nil, nil, err } // remove by pandoc generated title @@ -166,7 +166,7 @@ func (a *pandocConverter) extractTOC(src []byte) ([]byte, tableofcontents.Root, f = func(n *html.Node) bool { if n.Type == html.ElementNode && n.Data == "nav" && attr(n, "id") == "TOC" { toc = parseTOC(n) - if !a.cfg.MarkupConfig.Pandoc.PreserveTOC { + if !a.cfg.MarkupConfig().Pandoc.PreserveTOC { n.Parent.RemoveChild(n) } return true @@ -188,12 +188,12 @@ func (a *pandocConverter) extractTOC(src []byte) ([]byte, tableofcontents.Root, } f(body) if err != nil { - return nil, tableofcontents.Root{}, err + return nil, nil, err } buf.Reset() err = html.Render(&buf, body) if err != nil { - return nil, tableofcontents.Root{}, err + return nil, nil, err } // ltrim \n\n and rtrim \n\n which are added by html.Render res := buf.Bytes()[8:] @@ -202,9 +202,9 @@ func (a *pandocConverter) extractTOC(src []byte) ([]byte, tableofcontents.Root, } // parseTOC returns a TOC root from the given toc Node -func parseTOC(doc *html.Node) tableofcontents.Root { +func parseTOC(doc *html.Node) *tableofcontents.Fragments { var ( - toc tableofcontents.Root + toc tableofcontents.Builder f func(*html.Node, int, int) ) f = func(n *html.Node, row, level int) { @@ -222,9 +222,9 @@ func parseTOC(doc *html.Node) tableofcontents.Root { 
continue } href := attr(c, "href")[1:] - toc.AddAt(tableofcontents.Heading{ - Text: nodeContent(c), - ID: href, + toc.AddAt(&tableofcontents.Heading{ + Title: nodeContent(c), + ID: href, }, row, level) } f(n.FirstChild, row, level) @@ -235,7 +235,7 @@ func parseTOC(doc *html.Node) tableofcontents.Root { } } f(doc.FirstChild, -1, 0) - return toc + return toc.Build() } func attr(node *html.Node, key string) string { From fbbcca40c8214e05bca9801684eb1cc7f8ac3723 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 9 Oct 2023 15:52:43 +0200 Subject: [PATCH 9/9] add integration test to test toc generation --- markup/pandoc/integration_test.go | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/markup/pandoc/integration_test.go b/markup/pandoc/integration_test.go index 477ba43ba1a..926af38ad1c 100644 --- a/markup/pandoc/integration_test.go +++ b/markup/pandoc/integration_test.go @@ -60,3 +60,26 @@ func TestConversionWithHeader(t *testing.T) { b.AssertFileContent("public/p1/index.html", `

testContent

`) } + +func TestConversionWithExtractedToc(t *testing.T) { + t.Parallel() + + files := ` +-- config.toml -- +-- content/p1.md -- +# title 1 +## title 2 +-- layouts/_default/single.html -- +{{ .TableOfContents }} +{{ .Content }} +` + b := hugolib.NewIntegrationTestBuilder( + hugolib.IntegrationTestConfig{ + T: t, + TxtarString: files, + NeedsOsFS: true, + }, + ).Build() + + b.AssertFileContent("public/p1/index.html", "\n

title 1

\n

title 2

") +}