Skip to content

Commit

Permalink
import: convert to html for fe
Browse files Browse the repository at this point in the history
  • Loading branch information
hayzamjs committed Nov 27, 2024
1 parent a69f3bc commit 8b980bb
Show file tree
Hide file tree
Showing 5 changed files with 177 additions and 43 deletions.
7 changes: 6 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
module git.difuse.io/Difuse/kalmia

go 1.22.1
go 1.23

toolchain go1.23.2

require (
github.com/aws/aws-sdk-go v1.55.5
Expand All @@ -27,6 +29,8 @@ require (
dario.cat/mergo v1.0.1 // indirect
github.com/Microsoft/go-winio v0.6.2 // indirect
github.com/ProtonMail/go-crypto v1.0.0 // indirect
github.com/PuerkitoBio/goquery v1.10.0 // indirect
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/cloudflare/circl v1.4.0 // indirect
github.com/cyphar/filepath-securejoin v0.3.1 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
Expand Down Expand Up @@ -59,6 +63,7 @@ require (
github.com/skeema/knownhosts v1.3.0 // indirect
github.com/stretchr/objx v0.5.2 // indirect
github.com/xanzy/ssh-agent v0.3.3 // indirect
github.com/yuin/goldmark v1.7.8 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/net v0.29.0 // indirect
golang.org/x/sync v0.8.0 // indirect
Expand Down
10 changes: 10 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERo
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
github.com/ProtonMail/go-crypto v1.0.0 h1:LRuvITjQWX+WIfr930YHG2HNfjR1uOfyf5vE0kC2U78=
github.com/ProtonMail/go-crypto v1.0.0/go.mod h1:EjAoLdwvbIOoOQr3ihjnSoLZRtE8azugULFRteWMNc0=
github.com/PuerkitoBio/goquery v1.10.0 h1:6fiXdLuUvYs2OJSvNRqlNPoBm6YABE226xrbavY5Wv4=
github.com/PuerkitoBio/goquery v1.10.0/go.mod h1:TjZZl68Q3eGHNBA8CWaxAN7rOU1EbDz3CWuolcO5Yu4=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be h1:9AeTilPcZAjCFIImctFaOjnTIavg87rW78vTPkQqLI8=
github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be/go.mod h1:ySMOLuWl6zY27l47sB3qLNK6tF2fkHG55UZxx8oIVo4=
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
Expand Down Expand Up @@ -136,6 +140,8 @@ github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8
github.com/xanzy/ssh-agent v0.3.3 h1:+/15pJfg/RsTxqYcX6fHqOXZwwMP+2VyYWJeWM2qQFM=
github.com/xanzy/ssh-agent v0.3.3/go.mod h1:6dzNDKs0J9rVPHPhaGCukekBHKqfl+L3KghI1Bc68Uw=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
github.com/yuin/goldmark v1.7.8 h1:iERMLn0/QJeHFhxSt3p6PeN9mGnvIKSpG9YYorDMnic=
github.com/yuin/goldmark v1.7.8/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
Expand All @@ -162,6 +168,7 @@ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug
golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc=
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo=
golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
Expand All @@ -185,13 +192,15 @@ golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34=
golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U=
golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
golang.org/x/term v0.24.0 h1:Mh5cbb+Zk2hqqXNO7S1iTjEphVL+jb8ZWaqh/g+JWkM=
golang.org/x/term v0.24.0/go.mod h1:lOBK/LVxemqiMij05LGJ0tzNr8xlmwBRJ81PX6wVLH8=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
Expand All @@ -202,6 +211,7 @@ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224=
golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
Expand Down
95 changes: 53 additions & 42 deletions services/docs_import.go
Original file line number Diff line number Diff line change
@@ -1,71 +1,75 @@
package services

import (
"encoding/json"
"bytes"
"fmt"
"log"
"net/url"
"os"
"path/filepath"
"regexp"
"strings"

"git.difuse.io/Difuse/kalmia/config"
"git.difuse.io/Difuse/kalmia/utils"
"github.com/gabriel-vasile/mimetype"
"github.com/PuerkitoBio/goquery"
"github.com/go-git/go-git/v5"
"github.com/go-git/go-git/v5/plumbing/transport/http"
"github.com/yuin/goldmark"
"github.com/yuin/goldmark/parser"
"github.com/yuin/goldmark/renderer/html"
)

var MediaRegex = regexp.MustCompile(`!\[[^\]]*\]\(([^)]+)\)|\[[^\]]*\]\(([^)]+\.(?:jpg|jpeg|png|gif|mp4|mov|avi|mkv|webm|mp3|wav|ogg|flac|svg|webp))\)`)
func processMarkdown(content, dir string, cfg *config.Config) (string, error) {
gm := goldmark.New(
goldmark.WithRendererOptions(html.WithUnsafe()),
goldmark.WithParserOptions(parser.WithAutoHeadingID()),
)

func processMdAsset(content, dir string, cfg *config.Config) (string, error) {
URLSlice := MediaRegex.FindAllStringSubmatch(content, -1)
for _, sub := range URLSlice {
mediaPath := ""
var output bytes.Buffer

if sub[1] != "" {
mediaPath = sub[1]
} else if sub[2] != "" {
mediaPath = sub[2]
}
err := gm.Convert([]byte(content), &output)
if err != nil {
return "", err
}

if strings.HasPrefix(mediaPath, "http") {
continue
}
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(output.Bytes()))
if err != nil {
return "", fmt.Errorf("failed to parse HTML: %v", err)
}

decodedMediaPath, err := url.QueryUnescape(mediaPath)
if err != nil {
return content, fmt.Errorf("error decoding media path: %v", err)
}
doc.Find("[src]").Each(func(i int, s *goquery.Selection) {
src, exists := s.Attr("src")
if exists {
if strings.HasPrefix(src, "http") {
return
}

absPath, err := filepath.Abs(filepath.Join(dir, decodedMediaPath))
if err != nil {
return content, fmt.Errorf("error resolving media path: %v", err)
}
decodedSrc, err := url.QueryUnescape(src)
if err != nil {
return
}

file, err := os.Open(absPath)
if err != nil {
return content, fmt.Errorf("error opening media file: %v", err)
}
defer file.Close()
absPath := filepath.Join(dir, decodedSrc)
file, err := os.Open(absPath)

mime, _ := mimetype.DetectReader(file)
_, err = file.Seek(0, 0)
if err != nil {
log.Fatal(err)
}
if err == nil {
mime := utils.GetContentType(absPath)

s3URL, err := UploadToStorage(file, filepath.Base(mediaPath), mime.String(), cfg)
s3URL, err := UploadToStorage(file, filepath.Base(absPath), mime, cfg)

if err != nil {
return content, err
if err == nil {
s.SetAttr("src", s3URL)
}
}
}
})

content = strings.ReplaceAll(content, mediaPath, s3URL)
var updatedOutput bytes.Buffer

if err := goquery.Render(&updatedOutput, doc.Selection); err != nil {
return "", fmt.Errorf("failed to render updated HTML: %v", err)
}

return content, nil
return updatedOutput.String(), nil
}

func parseMarkdownFiles(dir string, doc map[string]interface{}, cfg *config.Config) error {
Expand Down Expand Up @@ -99,7 +103,7 @@ func parseMarkdownFiles(dir string, doc map[string]interface{}, cfg *config.Conf
}

strContent := string(content)
strContent, err = processMdAsset(strContent, dir, cfg)
strContent, err = processMarkdown(strContent, dir, cfg)
if err != nil {
return err
}
Expand Down Expand Up @@ -153,7 +157,14 @@ func (service *DocService) ImportGitbook(url, username, password string, cfg *co
return "", fmt.Errorf("failed to parse markdown files: %v", err)
}

jsonBytes, err := json.Marshal(doc)
for key, value := range doc {
if htmlContent, ok := value.(string); ok {
doc[key] = strings.ReplaceAll(htmlContent, "\n", "")
}
}

jsonBytes, err := utils.MarshalWithoutEscape(doc)

if err != nil {
return "", fmt.Errorf("failed to convert to JSON: %v", err)
}
Expand Down
85 changes: 85 additions & 0 deletions utils/markdown.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package utils

import (
"fmt"
"strings"

"github.com/yuin/goldmark"
"github.com/yuin/goldmark/ast"
"golang.org/x/net/html"
)

// ProcessHTML writes a converted form of every direct child of node to output.
//
// Children of type *ast.RawHTML are handed to ExtractAssetFromHTML. When an
// asset is found, the tag is replaced by Markdown: image syntax
// ("![caption](src)") for "img" and "figure", and link syntax
// ("[caption](src)") for "video" and "audio". Raw HTML without a recognised
// asset is copied through unchanged. Every other child is rendered with
// goldmark's default renderer.
//
// Only the direct children of node are inspected; the walk is not recursive.
// NOTE(review): goldmark models ast.RawHTML as an inline node, so it is
// presumably found under paragraph-like parents rather than directly under
// the document root — confirm that callers pass the intended parent.
//
// source is the Markdown text node was parsed from. A renderer failure is
// returned wrapped, so callers can still inspect the cause with errors.Is/As.
func ProcessHTML(node ast.Node, source []byte, output *strings.Builder) error {
	// One renderer is enough for the whole walk; building it per child is
	// wasted work.
	renderer := goldmark.DefaultRenderer()

	for child := node.FirstChild(); child != nil; child = child.NextSibling() {
		raw, isRaw := child.(*ast.RawHTML)
		if !isRaw {
			// Render Markdown elements as-is.
			if err := renderer.Render(output, source, child); err != nil {
				return fmt.Errorf("error rendering markdown: %w", err)
			}
			continue
		}

		rawHTML := string(raw.Text(source))
		elementType, srcPath, caption, ok := ExtractAssetFromHTML(rawHTML)
		if !ok {
			// Keep raw HTML that does not reference an asset.
			output.WriteString(rawHTML)
			continue
		}

		// Convert the recognised HTML element to Markdown.
		switch elementType {
		case "img", "figure":
			fmt.Fprintf(output, "![%s](%s)\n", caption, srcPath)
		case "video", "audio":
			fmt.Fprintf(output, "[%s](%s)\n", caption, srcPath)
		}
	}
	return nil
}

// ExtractAssetFromHTML parses content as HTML and looks for a media asset.
//
// It returns, in order: the element type ("img", "audio" or "video"), the
// value of that element's src attribute, the caption taken from the direct
// text children of a figcaption element, and whether an asset was found.
// When several elements or attributes match, the last one visited in
// document order wins. If parsing fails or no non-empty src is present, the
// three strings are empty and the flag is false.
func ExtractAssetFromHTML(content string) (string, string, string, bool) {
	root, err := html.Parse(strings.NewReader(content))
	if err != nil {
		return "", "", "", false
	}

	var kind, src, caption string

	// inspect records asset information carried by a single node.
	inspect := func(el *html.Node) {
		if el.Type != html.ElementNode {
			return
		}
		if el.Data == "figcaption" {
			// Only text nodes directly under the figcaption contribute.
			var text strings.Builder
			for kid := el.FirstChild; kid != nil; kid = kid.NextSibling {
				if kid.Type == html.TextNode {
					text.WriteString(kid.Data)
				}
			}
			caption = text.String()
			return
		}
		if el.Data != "img" && el.Data != "audio" && el.Data != "video" {
			return
		}
		for _, attribute := range el.Attr {
			if attribute.Key == "src" {
				src, kind = attribute.Val, el.Data
			}
		}
	}

	// Pre-order walk of the whole tree without recursion: descend first,
	// otherwise climb until a node with an unvisited sibling is found.
	for cur := root; cur != nil; {
		inspect(cur)
		if cur.FirstChild != nil {
			cur = cur.FirstChild
			continue
		}
		for cur != nil && cur.NextSibling == nil {
			cur = cur.Parent
		}
		if cur != nil {
			cur = cur.NextSibling
		}
	}

	if src == "" {
		return "", "", "", false
	}
	return kind, src, caption, true
}
23 changes: 23 additions & 0 deletions utils/strings.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,16 @@ var contentTypes = map[string]string{
".jpeg": "image/jpeg",
".gif": "image/gif",
".svg": "image/svg+xml",
".mp4": "video/mp4",
".webm": "video/webm",
".ogg": "video/ogg",
".mp3": "audio/mpeg",
".wav": "audio/wav",
".pdf": "application/pdf",
".doc": "application/msword",
".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
".xls": "application/vnd.ms-excel",
".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
}

func GetContentType(filename string) string {
Expand Down Expand Up @@ -159,3 +169,16 @@ func PrettyJSON(input string) (string, error) {
}
return prettyJSON.String(), nil
}

// MarshalWithoutEscape encodes input as JSON without escaping the HTML
// characters '<', '>' and '&', which json.Marshal would otherwise emit as
// \u003c, \u003e and \u0026.
//
// The result is the bare JSON document: the newline that json.Encoder.Encode
// appends after every value is removed, so the output has the same shape as
// json.Marshal's. On failure the returned string is empty and the error wraps
// the encoder's error.
func MarshalWithoutEscape(input map[string]interface{}) (string, error) {
	var buffer bytes.Buffer
	encoder := json.NewEncoder(&buffer)
	encoder.SetEscapeHTML(false) // Prevent escaping HTML characters

	if err := encoder.Encode(input); err != nil {
		return "", fmt.Errorf("failed to marshal JSON: %w", err)
	}

	// Encode terminates the value with '\n'; strip it so callers get only
	// the JSON text.
	return string(bytes.TrimSuffix(buffer.Bytes(), []byte("\n"))), nil
}

0 comments on commit 8b980bb

Please sign in to comment.