From 64a8769b5149167eef4d9ba9ce86c2e5985a902d Mon Sep 17 00:00:00 2001
From: Jeremy Daer ]*>(.*?) ]*>(.*?) " + convertInline(text) + " " + strings.Join(parts, " ") + " " + html.EscapeString(md) + "
<br> between
blank-line-separated blocks for Trix paragraph spacing
- trixRenderer: compact blockquote output, HTML escaping for raw
HTML blocks (vs ), TrixBreak and
EscapedAt node rendering
- escapedAtParser: intercepts \@ before goldmark's standard escape
handling to preserve mention-suppression syntax
HTMLToMarkdown gains <br>-aware round-trip support: blockquote and
list handlers now normalize <br> tags before splitting, so multiline
blockquotes and list continuations survive the edit loop
(MarkdownToHTML → HTMLToMarkdown) faithfully. Also fixes multiline
blockquote/paragraph regex matching ((?i) → (?is)) and trims
trailing newlines from code fence content.
Promotes goldmark from indirect to direct dependency (v1.7.13, already
present via glamour).
---
go.mod | 2 +-
internal/richtext/richtext.go | 877 ++++++++++++++---------------
internal/richtext/richtext_test.go | 387 ++++++++++++-
3 files changed, 789 insertions(+), 477 deletions(-)
diff --git a/go.mod b/go.mod
index 97ef1cf2..957cd115 100644
--- a/go.mod
+++ b/go.mod
@@ -19,6 +19,7 @@ require (
github.com/spf13/cobra v1.10.2
github.com/spf13/pflag v1.0.10
github.com/stretchr/testify v1.11.1
+ github.com/yuin/goldmark v1.7.13
github.com/zalando/go-keyring v0.2.8
golang.org/x/mod v0.34.0
golang.org/x/sys v0.42.0
@@ -70,7 +71,6 @@ require (
github.com/rivo/uniseg v0.4.7 // indirect
github.com/rogpeppe/go-internal v1.14.1 // indirect
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
- github.com/yuin/goldmark v1.7.13 // indirect
github.com/yuin/goldmark-emoji v1.0.6 // indirect
golang.org/x/net v0.38.0 // indirect
golang.org/x/sync v0.20.0 // indirect
diff --git a/internal/richtext/richtext.go b/internal/richtext/richtext.go
index 04f3f210..db59b711 100644
--- a/internal/richtext/richtext.go
+++ b/internal/richtext/richtext.go
@@ -3,6 +3,7 @@
package richtext
import (
+ "bytes"
"errors"
"fmt"
"html"
@@ -15,40 +16,18 @@ import (
"unicode/utf8"
"github.com/charmbracelet/glamour"
+ "github.com/yuin/goldmark"
+ "github.com/yuin/goldmark/ast"
+ "github.com/yuin/goldmark/extension"
+ "github.com/yuin/goldmark/parser"
+ "github.com/yuin/goldmark/renderer"
+ gmhtml "github.com/yuin/goldmark/renderer/html"
+ "github.com/yuin/goldmark/text"
+ "github.com/yuin/goldmark/util"
)
-// Pre-compiled regexes for MarkdownToHTML list detection
-var (
- ulPattern = regexp.MustCompile(`^(\s*)[-*+]\s+(.*)$`)
- olPattern = regexp.MustCompile(`^(\s*)\d+\.\s+(.*)$`)
-)
-
-// CommonMark §2.4: any ASCII punctuation may be backslash-escaped.
-// Exact set: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
-//
-// We intentionally omit @ from the set: in Basecamp context \@ is the
-// idiomatic way to suppress a mention ping, so it must pass through
-// literally and not be unescaped into a bare @ that ResolveMentions
-// would convert into a ]*>(.*?)
`)
reH5 = regexp.MustCompile(`(?i)]*>(.*?)
`)
reH6 = regexp.MustCompile(`(?i)]*>(.*?)
`)
- reBlockquote = regexp.MustCompile(`(?i)]*>(.*?)
`)
+ reBlockquote = regexp.MustCompile(`(?is)]*>(.*?)
`)
reCodeBlock = regexp.MustCompile(`(?is)]*>
`)
reCodeLang = regexp.MustCompile(`class="language-([^"]*)"`)
reCodeInner = regexp.MustCompile(`(?is)]*(?:class="language-([^"]*)")?[^>]*>(.*?)]*>([\s\S]*?)`)
- reUL = regexp.MustCompile(`(?is)]*>(.*?)
`)
- reOL = regexp.MustCompile(`(?is)]*>(.*?)
`)
- reLI = regexp.MustCompile(`(?is)
`)
reHR = regexp.MustCompile(`(?i)
`)
)
@@ -131,355 +107,229 @@ var reMarkdownPatterns = []*regexp.Regexp{
regexp.MustCompile(`^>\s`),
}
-// MarkdownToHTML converts Markdown text to HTML suitable for Basecamp's rich text fields.
-// It handles common Markdown syntax: headings, bold, italic, links, lists, code blocks, and blockquotes.
-// If the input already appears to be HTML, it is returned unchanged to preserve existing formatting.
-func MarkdownToHTML(md string) string {
- if md == "" {
- return ""
- }
-
- // If input is already HTML, return unchanged to preserve existing content
- if IsHTML(md) {
- return md
- }
-
- // Normalize line endings
- md = strings.ReplaceAll(md, "\r\n", "\n")
- md = strings.ReplaceAll(md, "\r", "\n")
-
- var result strings.Builder
- lines := strings.Split(md, "\n")
-
- var inCodeBlock bool
- var codeBlockLang string
- var codeLines []string
- var inList bool
- var listItems []string
- var listType string // "ul" or "ol"
- var pendingBreak bool
- var paraLines []string
-
- flushPendingBreak := func() {
- if pendingBreak {
- result.WriteString("
\n")
- pendingBreak = false
- }
- }
+// mdConverter is the goldmark Markdown-to-HTML converter configured for Trix compatibility.
+var mdConverter = goldmark.New(
+ goldmark.WithExtensions(extension.Strikethrough),
+ goldmark.WithRendererOptions(gmhtml.WithUnsafe()),
+ goldmark.WithParserOptions(
+ parser.WithInlineParsers(
+ util.Prioritized(&escapedAtParser{}, 900),
+ ),
+ parser.WithASTTransformers(
+ util.Prioritized(&trixTransformer{}, 100),
+ ),
+ ),
+ goldmark.WithRendererOptions(
+ renderer.WithNodeRenderers(
+ util.Prioritized(&trixRenderer{}, 500),
+ ),
+ ),
+)
- flushParagraph := func() {
- if len(paraLines) > 0 {
- flushPendingBreak()
- text := strings.Join(paraLines, " ")
- result.WriteString("
\n for Trix paragraph spacing.
+type TrixBreak struct{ ast.BaseBlock }
- flushList := func() {
- if len(listItems) > 0 {
- result.WriteString("<" + listType + ">\n")
- for _, item := range listItems {
- result.WriteString("
\n")
- } else {
- result.WriteString("" + code + "
\n")
- }
- inCodeBlock = false
- codeLines = nil
- codeBlockLang = ""
- } else {
- // Start code block
- flushParagraph()
- flushList()
- flushPendingBreak()
- inCodeBlock = true
- codeBlockLang = after
- }
- continue
- }
+func (n *TrixBreak) Kind() ast.NodeKind { return KindTrixBreak }
+func (n *TrixBreak) Dump(source []byte, level int) { ast.DumpHelper(n, source, level, nil, nil) }
- if inCodeBlock {
- codeLines = append(codeLines, line)
- continue
- }
+// EscapedAt is a custom inline node that renders as literal \@.
+type EscapedAt struct{ ast.BaseInline }
- // Check for list items (using precompiled regexes)
- ulMatch := ulPattern.FindStringSubmatch(line)
- olMatch := olPattern.FindStringSubmatch(line)
-
- if ulMatch != nil {
- flushParagraph()
- if !inList || listType != "ul" {
- flushList()
- flushPendingBreak()
- inList = true
- listType = "ul"
- }
- pendingBreak = false // blank was between items, not after the list
- listItems = append(listItems, convertInline(ulMatch[2]))
- continue
- }
+// KindEscapedAt is the node kind for EscapedAt.
+var KindEscapedAt = ast.NewNodeKind("EscapedAt")
- if olMatch != nil {
- flushParagraph()
- if !inList || listType != "ol" {
- flushList()
- flushPendingBreak()
- inList = true
- listType = "ol"
- }
- pendingBreak = false // blank was between items, not after the list
- listItems = append(listItems, convertInline(olMatch[2]))
- continue
- }
+func (n *EscapedAt) Kind() ast.NodeKind { return KindEscapedAt }
+func (n *EscapedAt) Dump(source []byte, level int) { ast.DumpHelper(n, source, level, nil, nil) }
- // Empty line - handle differently based on context
- if strings.TrimSpace(line) == "" {
- if inList {
- // In a list: empty lines between items create spacing but don't break the list.
- // Record pending break so content after the list gets proper separation.
- pendingBreak = true
- continue
- }
- // Not in a list: flush paragraph and record break
- flushParagraph()
- if result.Len() > 0 {
- pendingBreak = true
- }
- continue
- }
+// escapedAtParser intercepts \@ before goldmark's standard backslash escape handling.
+type escapedAtParser struct{}
- // Check for list continuation lines (indented text that continues previous list item)
- if inList && len(listItems) > 0 {
- // Check if line is indented (starts with spaces or tabs)
- if strings.HasPrefix(line, " ") || strings.HasPrefix(line, "\t") {
- // This is a continuation of the last list item
- trimmedLine := strings.TrimSpace(line)
- // Append to last list item with " + code + "
separator
- lastItemIndex := len(listItems) - 1
- listItems[lastItemIndex] = listItems[lastItemIndex] + "
\n" + convertInline(trimmedLine)
- pendingBreak = false // blank was before continuation, not after the list
- continue
- }
- }
+func (p *escapedAtParser) Trigger() []byte { return []byte{'\\'} }
- // Not a list item or continuation, flush any pending list
- flushList()
+func (p *escapedAtParser) Parse(_ ast.Node, block text.Reader, _ parser.Context) ast.Node {
+ line, _ := block.PeekLine()
+ if len(line) < 2 || line[0] != '\\' || line[1] != '@' {
+ return nil
+ }
+ block.Advance(2)
+ return &EscapedAt{}
+}
- // Headings
- if strings.HasPrefix(line, "#") {
- flushParagraph()
- flushPendingBreak()
- }
- if after, ok := strings.CutPrefix(line, "######"); ok {
- result.WriteString("" + convertInline(strings.TrimSpace(after)) + "
\n")
- continue
- }
- if after, ok := strings.CutPrefix(line, "#####"); ok {
- result.WriteString("" + convertInline(strings.TrimSpace(after)) + "
\n")
- continue
- }
- if after, ok := strings.CutPrefix(line, "####"); ok {
- result.WriteString("" + convertInline(strings.TrimSpace(after)) + "
\n")
- continue
- }
- if after, ok := strings.CutPrefix(line, "###"); ok {
- result.WriteString("" + convertInline(strings.TrimSpace(after)) + "
\n")
- continue
- }
- if after, ok := strings.CutPrefix(line, "##"); ok {
- result.WriteString("" + convertInline(strings.TrimSpace(after)) + "
\n")
- continue
- }
- if after, ok := strings.CutPrefix(line, "#"); ok {
- result.WriteString("" + convertInline(strings.TrimSpace(after)) + "
\n")
- continue
- }
+// trixTransformer modifies the AST for Trix-compatible HTML output.
+type trixTransformer struct{}
- // Blockquote
- if strings.HasPrefix(line, ">") {
- flushParagraph()
- flushPendingBreak()
+func (t *trixTransformer) Transform(node *ast.Document, reader text.Reader, pc parser.Context) {
+ // Phase 1: Force tight lists, convert soft breaks to hard in list items,
+ // and unwrap blockquote paragraphs
+ _ = ast.Walk(node, func(n ast.Node, entering bool) (ast.WalkStatus, error) {
+ if !entering {
+ return ast.WalkContinue, nil
}
- if after, ok := strings.CutPrefix(line, ">"); ok {
- quote := strings.TrimSpace(after)
- result.WriteString("" + convertInline(quote) + "
\n")
- continue
+ switch v := n.(type) {
+ case *ast.List:
+ v.IsTight = true
+ for li := v.FirstChild(); li != nil; li = li.NextSibling() {
+ replaceParagraphsWithTextBlocks(li)
+ convertSoftBreaksToHard(li)
+ }
+ case *ast.Blockquote:
+ replaceParagraphsWithTextBlocks(v)
+ convertSoftBreaksToHard(v)
+ insertBreaksBetweenTextBlocks(v)
}
+ return ast.WalkContinue, nil
+ })
- // Horizontal rule
- trimmed := strings.TrimSpace(line)
- if len(trimmed) >= 3 && (allChars(trimmed, '-') || allChars(trimmed, '*') || allChars(trimmed, '_')) {
- flushParagraph()
- flushPendingBreak()
- result.WriteString("
\n")
- continue
+ // Phase 2: Insert TrixBreak nodes before blank-line-separated top-level blocks
+ for child := node.FirstChild(); child != nil; child = child.NextSibling() {
+ if child.HasBlankPreviousLines() && child.PreviousSibling() != nil {
+ br := &TrixBreak{}
+ node.InsertBefore(node, child, br)
}
-
- // Accumulate paragraph lines
- paraLines = append(paraLines, line)
}
+}
- // Flush any remaining paragraph or list
- flushParagraph()
- flushList()
-
- // Handle unclosed code block
- if inCodeBlock && len(codeLines) > 0 {
- code := strings.Join(codeLines, "\n")
- code = escapeHTML(code)
- result.WriteString("
\n")
+func replaceParagraphsWithTextBlocks(parent ast.Node) {
+ for child := parent.FirstChild(); child != nil; {
+ next := child.NextSibling()
+ if p, ok := child.(*ast.Paragraph); ok {
+ tb := ast.NewTextBlock()
+ for gc := p.FirstChild(); gc != nil; {
+ gnext := gc.NextSibling()
+ tb.AppendChild(tb, gc)
+ gc = gnext
+ }
+ tb.SetLines(p.Lines())
+ parent.ReplaceChild(parent, p, tb)
+ }
+ child = next
}
-
- return strings.TrimSpace(result.String())
}
-// convertInline converts inline Markdown elements (bold, italic, links, code) to HTML.
-// Code spans and backslash escapes are protected from further processing to preserve
-// their literal content.
-func convertInline(text string) string {
- // Protect escaped backticks before code-span detection so \` remains literal
- // and cannot be interpreted as a code-span delimiter.
- var escapedBackticks []string
- text = reEscapedBacktick.ReplaceAllStringFunc(text, func(_ string) string {
- idx := len(escapedBackticks)
- escapedBackticks = append(escapedBackticks, "`")
- return "\x00ESCBT" + strconv.Itoa(idx) + "\x00"
- })
-
- // Extract code spans — their content must be completely literal.
- var codeSpans []string
- text = reCodeSpan.ReplaceAllStringFunc(text, func(match string) string {
- inner := reCodeSpan.FindStringSubmatch(match)
- if len(inner) >= 2 {
- idx := len(codeSpans)
- codeSpans = append(codeSpans, inner[1])
- return "\x00CODE" + strconv.Itoa(idx) + "\x00"
+func convertSoftBreaksToHard(parent ast.Node) {
+ _ = ast.Walk(parent, func(n ast.Node, entering bool) (ast.WalkStatus, error) {
+ if !entering {
+ return ast.WalkContinue, nil
}
- return match
- })
-
- // Process backslash escapes (CommonMark §2.4): a backslash before an ASCII
- // punctuation character produces the literal character. We extract these into
- // placeholders so they are not treated as Markdown delimiters and restore
- // them afterward. We use attribute-safe escaping on restore because escaped
- // punctuation can be captured inside href/src values before link/image HTML is built.
- var escaped []string
- text = reBackslashEscape.ReplaceAllStringFunc(text, func(match string) string {
- idx := len(escaped)
- escaped = append(escaped, match[1:]) // the punctuation character after the backslash
- return "\x00ESC" + strconv.Itoa(idx) + "\x00"
+ if t, ok := n.(*ast.Text); ok && t.SoftLineBreak() {
+ t.SetSoftLineBreak(false)
+ t.SetHardLineBreak(true)
+ }
+ return ast.WalkContinue, nil
})
+}
- // Escape HTML entities
- text = escapeHTML(text)
-
- // Bold with ** or __
- text = reBoldStar.ReplaceAllString(text, "$1")
- text = reBoldUnder.ReplaceAllString(text, "$1")
-
- // Italic with * or _ (but not inside words for _)
- text = reItalicStar.ReplaceAllString(text, "$1")
- text = reItalicUnder.ReplaceAllStringFunc(text, func(s string) string {
- inner := reItalicInner.FindStringSubmatch(s)
- if len(inner) >= 2 {
- prefix := ""
- suffix := ""
- if len(s) > 0 && s[0] != '_' {
- prefix = string(s[0])
- }
- if len(s) > 0 && s[len(s)-1] != '_' {
- suffix = string(s[len(s)-1])
+func insertBreaksBetweenTextBlocks(parent ast.Node) {
+ for child := parent.FirstChild(); child != nil; child = child.NextSibling() {
+ if _, ok := child.(*ast.TextBlock); ok {
+ if next := child.NextSibling(); next != nil {
+ if _, ok := next.(*ast.TextBlock); ok {
+ br := &TrixBreak{}
+ parent.InsertAfter(parent, child, br)
+ }
}
- return prefix + "" + inner[1] + "" + suffix
}
- return s
- })
+ }
+}
- // Images  - MUST come before links since image syntax contains link syntax
- text = reImage.ReplaceAllStringFunc(text, func(match string) string {
- parts := reImage.FindStringSubmatch(match)
- if len(parts) >= 3 {
- alt := escapeAttr(parts[1])
- src := resolveDestinationEscapes(parts[2], escaped, escapedBackticks)
- src = escapeAttr(src)
- return `" + code + "`
- }
- return match
- })
+// trixRenderer provides custom rendering for Trix-compatible HTML output.
+type trixRenderer struct{}
- // Links [text](url)
- text = reLink.ReplaceAllStringFunc(text, func(match string) string {
- parts := reLink.FindStringSubmatch(match)
- if len(parts) >= 3 {
- linkText := parts[1]
- href := resolveDestinationEscapes(parts[2], escaped, escapedBackticks)
- href = escapeAttr(href)
- return `` + linkText + ``
- }
- return match
- })
+func (r *trixRenderer) RegisterFuncs(reg renderer.NodeRendererFuncRegisterer) {
+ reg.Register(ast.KindRawHTML, r.renderRawHTML)
+ reg.Register(ast.KindHTMLBlock, r.renderHTMLBlock)
+ reg.Register(ast.KindBlockquote, r.renderBlockquote)
+ reg.Register(KindTrixBreak, r.renderTrixBreak)
+ reg.Register(KindEscapedAt, r.renderEscapedAt)
+}
- // Strikethrough ~~text~~
- text = reStrikethrough.ReplaceAllString(text, "
$1")
+func (r *trixRenderer) renderBlockquote(w util.BufWriter, _ []byte, _ ast.Node, entering bool) (ast.WalkStatus, error) {
+ if entering {
+ _, _ = w.WriteString("")
+ } else {
+ _, _ = w.WriteString("
\n")
+ }
+ return ast.WalkContinue, nil
+}
- // Restore backslash-escaped characters in body text. Placeholders inside
- // link/image destinations were already resolved with percent-encoding above.
- escapedRendered := make([]string, len(escaped))
- for i, ch := range escaped {
- escapedRendered[i] = escapeAttr(ch)
+func (r *trixRenderer) renderRawHTML(w util.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) {
+ if !entering {
+ return ast.WalkContinue, nil
+ }
+ n, ok := node.(*ast.RawHTML)
+ if !ok {
+ return ast.WalkContinue, nil
}
- text = restorePlaceholders(text, "ESC", escapedRendered)
- text = restorePlaceholders(text, "ESCBT", escapedRenderedBackticks(escapedBackticks))
-
- // Restore code spans (HTML-escape their content since extraction now
- // happens before escapeHTML to allow backslash-escape processing).
- codeRendered := make([]string, len(codeSpans))
- for i, code := range codeSpans {
- codeRendered[i] = "" + escapeHTML(code) + ""
+ for i := 0; i < n.Segments.Len(); i++ {
+ seg := n.Segments.At(i)
+ _, _ = w.Write(util.EscapeHTML(seg.Value(source)))
}
- text = restorePlaceholders(text, "CODE", codeRendered)
+ return ast.WalkContinue, nil
+}
- return text
+func (r *trixRenderer) renderHTMLBlock(w util.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) {
+ if !entering {
+ return ast.WalkContinue, nil
+ }
+ n, ok := node.(*ast.HTMLBlock)
+ if !ok {
+ return ast.WalkContinue, nil
+ }
+ lines := n.Lines()
+ parts := make([]string, 0, lines.Len()+1)
+ for i := 0; i < lines.Len(); i++ {
+ seg := lines.At(i)
+ escaped := strings.TrimRight(string(util.EscapeHTML(seg.Value(source))), "\n")
+ parts = append(parts, escaped)
+ }
+ if n.HasClosure() {
+ escaped := strings.TrimRight(string(util.EscapeHTML(n.ClosureLine.Value(source))), "\n")
+ parts = append(parts, escaped)
+ }
+ _, _ = w.WriteString("
\n")
+ return ast.WalkContinue, nil
}
-func restorePlaceholders(text, prefix string, replacements []string) string {
- if len(replacements) == 0 {
- return text
+func (r *trixRenderer) renderEscapedAt(w util.BufWriter, _ []byte, _ ast.Node, entering bool) (ast.WalkStatus, error) {
+ if !entering {
+ return ast.WalkContinue, nil
}
- pairs := make([]string, 0, len(replacements)*2)
- for i, repl := range replacements {
- pairs = append(pairs, "\x00"+prefix+strconv.Itoa(i)+"\x00", repl)
+ _, _ = w.WriteString(`\@`)
+ return ast.WalkContinue, nil
+}
+
+// MarkdownToHTML converts Markdown text to HTML suitable for Basecamp's rich text fields.
+// It uses goldmark with custom AST transformations for Trix editor compatibility.
+// If the input already appears to be HTML, it is returned unchanged to preserve existing formatting.
+func MarkdownToHTML(md string) string {
+ if md == "" {
+ return ""
+ }
+
+ if IsHTML(md) {
+ return md
+ }
+
+ md = strings.ReplaceAll(md, "\r\n", "\n")
+ md = strings.ReplaceAll(md, "\r", "\n")
+
+ var buf bytes.Buffer
+ if err := mdConverter.Convert([]byte(md), &buf); err != nil {
+ return "
tag followed by an optional newline, collapsing
+// the pair to a single \n. goldmark's hard-break output is <br>\n; Trix API
+// content may have standalone <br>.
+var reBRLine = regexp.MustCompile(`(?i)
\n?`)
+
+// formatListItem converts a list item's HTML content to Markdown, handling
+// <br> tags as indented continuation lines.
+func formatListItem(prefix, indent, content string) string {
+ content = strings.TrimSpace(content)
+ content = reBRLine.ReplaceAllString(content, "\n")
+ lines := strings.Split(content, "\n")
+ var parts []string
+ for i, line := range lines {
+ if i == 0 {
+ parts = append(parts, prefix+strings.TrimSpace(line))
+ } else {
+ // Preserve existing indentation from nested list conversion
+ parts = append(parts, indent+line)
+ }
+ }
+ return strings.Join(parts, "\n")
+}
+
+// convertCodeBlockHTML converts a <pre> match to Markdown.
+// Entities are left escaped so that later regex passes (reP, reStripTags) don't
+// corrupt code content like <p>. The global unescapeHTML at the end of
+// HTMLToMarkdown converts them.
+func convertCodeBlockHTML(s string) string {
+ langMatch := reCodeLang.FindStringSubmatch(s)
+ lang := ""
+ if len(langMatch) >= 2 {
+ lang = langMatch[1]
+ }
+ codeMatch := reCodeInner.FindStringSubmatch(s)
+ if len(codeMatch) >= 2 {
+ code := strings.TrimSuffix(codeMatch[1], "\n")
+ return "```" + lang + "\n" + code + "\n```"
+ }
+ return s
+}
+
+// reLIOpen matches an opening .../
inside items are recursively converted to Markdown.
+func extractListItems(html string) []string {
+ var items []string
+ i := 0
+ for {
+ // Find next top-level
/
") == len("")
+ depth--
+ if depth == 0 {
+ inner := html[contentStart:j]
+ var md string
+ if tag == "ul" {
+ md = convertULInner(inner)
+ } else {
+ md = convertOLInner(inner)
+ }
+ s := result.String()
+ if len(s) > 0 && s[len(s)-1] != '\n' {
+ result.WriteByte('\n')
+ }
+ result.WriteString(md + "\n\n")
+ j += closeLen
+ break
+ }
+ j += closeLen
+ } else if loc := reListOpen.FindStringSubmatchIndex(html[j:]); loc != nil && loc[0] == 0 {
+ depth++
+ j += loc[1]
+ } else {
+ j++
+ }
+ }
+ if depth > 0 {
+ // Unclosed tag — write original text
+ result.WriteString(html[matchStart:])
+ break
+ }
+ i = j
+ }
+ return result.String()
+}
+
+// replaceBalancedListBlocks finds <ul>/<ol> blocks by tracking tag
+// depth and converts each to Markdown. Handles nesting correctly where regex
+// lazy/greedy matching cannot.
+func replaceBalancedListBlocks(html string) string {
+ var result strings.Builder
+ i := 0
+ for {
+ loc := reListOpen.FindStringSubmatchIndex(html[i:])
+ if loc == nil {
+ result.WriteString(html[i:])
+ break
+ }
+ matchStart := i + loc[0]
+ tag := strings.ToLower(html[i+loc[2] : i+loc[3]]) // "ul" or "ol"
+ contentStart := i + loc[1]
+
+ result.WriteString(html[i:matchStart])
+
+ depth := 1
+ j := contentStart
+ for j < len(html) && depth > 0 {
+ // Decrement for any list close tag (handles mixed
") {
+ closeLen := 5 // len("/
") || hasPrefixFold(html[j:], " nesting)
+ if hasPrefixFold(html[j:], "
+// convertULInner converts inner content (between <ul> and </ul>) to Markdown.
+func convertULInner(inner string) string {
+ items := extractListItems(inner)
+ result := make([]string, 0, len(items))
+ for _, content := range items {
+ result = append(result, formatListItem("- ", " ", content))
+ }
+ return strings.Join(result, "\n")
+}
+
+// convertOLInner converts inner content (between <ol> and </ol>) to Markdown.
+func convertOLInner(inner string) string {
+ items := extractListItems(inner)
+ result := make([]string, 0, len(items))
+ for i, content := range items {
+ prefix := strconv.Itoa(i+1) + ". "
+ indent := strings.Repeat(" ", len(prefix))
+ result = append(result, formatListItem(prefix, indent, content))
+ }
+ return strings.Join(result, "\n")
+}
+
+// blockquoteInnerToMarkdown converts the inner HTML of a blockquote to Markdown,
+// handling nested block elements (lists, code blocks) before line-level operations.
+func blockquoteInnerToMarkdown(inner string) string {
+ content := strings.TrimSpace(inner)
+ content = reCodeBlock.ReplaceAllStringFunc(content, func(s string) string {
+ return convertCodeBlockHTML(s) + "\n\n"
+ })
+ content = replaceBalancedListBlocks(content)
+ // Replace
openers. Two passes so
para1
para2
produces + // "para1\n\npara2" (blank line = > separator) rather than "para1para2". + content = reClosingP.ReplaceAllString(content, "\n\n") + content = reOpeningP.ReplaceAllString(content, "") + content = reBRLine.ReplaceAllString(content, "\n") + content = reMultiNewline.ReplaceAllString(content, "\n\n") + return strings.TrimSpace(content) +} + +var ( + reOpeningP = regexp.MustCompile(`(?i)]*)?>`) + reClosingP = regexp.MustCompile(`(?i)
`) +) + // unescapeHTML converts HTML entities back to their characters. func unescapeHTML(s string) string { s = strings.ReplaceAll(s, "&", "&") @@ -934,7 +885,7 @@ func resolveMentionAnchors(html string, lookupByID PersonByIDFunc) (string, erro switch scheme { case "mention": // Zero API calls — use value as SGID, link text as display name (caller-trusted). - // Unescape HTML because convertInline already escaped the link text (e.g. & → &) + // Unescape HTML because goldmark already escaped the link text (e.g. & → &) // and MentionToHTML will re-escape — without this we'd double-encode. name := unescapeHTML(strings.TrimPrefix(displayText, "@")) tag = MentionToHTML(value, name) diff --git a/internal/richtext/richtext_test.go b/internal/richtext/richtext_test.go index 42796be7..fa9393b0 100644 --- a/internal/richtext/richtext_test.go +++ b/internal/richtext/richtext_test.go @@ -80,7 +80,7 @@ func TestMarkdownToHTML(t *testing.T) { { name: "ordered list with trailing spaces and descriptions", input: "1. **Item** - [Link](url) (time) \n Description here\n\n2. **Next** - [Link](url)", - expected: "", + expected: "", }, { name: "list followed by blank line then paragraph", @@ -88,9 +88,12 @@ func TestMarkdownToHTML(t *testing.T) { expected: "Following paragraph.
", }, { - name: "blank between list items does not leak break after list", + // CommonMark §5.4: "After" is a lazy continuation of the second list item. + // goldmark treats non-indented continuation lines as part of the list item, + // unlike our previous hand-rolled parser which ended the list. + name: "lazy continuation stays in list item", input: "- One\n\n- Two\nAfter", - expected: "After
", + expected: "func main() {}`,
+ expected: "func main() {}\n",
},
{
name: "code block without language",
input: "```\nsome code\n```",
- expected: "some code",
+ expected: "some code\n",
},
{
name: "horizontal rule with dashes",
@@ -150,7 +153,7 @@ func TestMarkdownToHTML(t *testing.T) {
{
name: "consecutive lines join into one paragraph",
input: "Line one\nLine two",
- expected: "Line one Line two
", + expected: "Line one\nLine two
", }, { name: "blank line before list", @@ -160,7 +163,7 @@ func TestMarkdownToHTML(t *testing.T) { { name: "blank line before code block", input: "Intro\n\n```\ncode\n```", - expected: "Intro
\ncode",
+ expected: "Intro
\ncode\n",
},
{
name: "leading blank lines ignored",
@@ -195,12 +198,13 @@ func TestMarkdownToHTML(t *testing.T) {
{
name: "code fence flushes accumulated paragraph",
input: "Text\n```go\nx\n```",
- expected: "Text
\nx",
+ expected: "Text
\nx\n",
},
{
- name: "horizontal rule flushes accumulated paragraph",
+ // CommonMark: "Text\n---" is a setext heading (h2), not paragraph + hr
+ name: "setext heading level 2",
input: "Text\n---",
- expected: "Text
\nintro
\n<div>hello</div>",
+ expected: "intro
\n<div>hello</div>\n",
},
}
@@ -301,9 +305,10 @@ func TestMarkdownToHTMLBackslashEscapes(t *testing.T) {
expected: ``,
},
{
+ // goldmark treats \% as literal % in URLs (CommonMark spec)
name: "escaped percent in link destination",
input: `[x](https://example.com/\%20)`,
- expected: ``,
+ expected: ``,
},
{
name: "escaped backslash in link destination",
@@ -323,7 +328,7 @@ func TestMarkdownToHTMLBackslashEscapes(t *testing.T) {
{
name: "escaped percent in image src",
input: ``,
- expected: `

\@John
`, + }, + { + name: "double backslash at", + input: `\\@John`, + expected: `\@John
`, + }, + { + name: "triple backslash at", + input: `\\\@John`, + expected: `\\@John
`, + }, + { + name: "quadruple backslash at", + input: `\\\\@John`, + expected: `\\@John
`, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := MarkdownToHTML(tt.input) + if result != tt.expected { + t.Errorf("MarkdownToHTML(%q)\ngot: %q\nwant: %q", tt.input, result, tt.expected) + } + }) + } +} + +func TestMarkdownToHTMLMultiParagraphBlockquote(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + { + name: "single line", + input: "> text", + expected: "text", + }, + { + name: "multiline", + input: "> line1\n> line2", + expected: "
line1", + }, + { + name: "multi-paragraph", + input: "> para1\n>\n> para2", + expected: "
\nline2
para1\n", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := MarkdownToHTML(tt.input) + if result != tt.expected { + t.Errorf("MarkdownToHTML(%q)\ngot: %q\nwant: %q", tt.input, result, tt.expected) + } + }) + } +} + +func TestMarkdownToHTMLRawHTMLBlock(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + { + name: "single-line script tag", + input: "", + expected: "
\npara2
<script>alert(1)</script>
", + }, + { + name: "multiline script tag", + input: "", + expected: "<script> alert(1) </script>
", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := MarkdownToHTML(tt.input) + if result != tt.expected { + t.Errorf("MarkdownToHTML(%q)\ngot: %q\nwant: %q", tt.input, result, tt.expected) + } + }) + } +} + +func TestHTMLToMarkdownMultilineBlockquote(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + { + name: "single paragraph", + input: "\n", + expected: "> text", + }, + { + name: "adjacent paragraphs", + input: "text
\n
", + expected: "> para1\n>\n> para2", + }, + { + name: "paragraph then list", + input: "para1
para2
", + expected: "> intro\n>\n> - one\n> - two", + }, + { + name: "paragraph then code block", + input: "intro
- one
- two
", + expected: "> intro\n>\n> ```\n> code\n> ```", + }, + { + name: "code block then paragraph", + input: "intro
code
", + expected: "> ```\n> code\n> ```\n>\n> tail", + }, + { + name: "code block then nested blockquote", + input: "codetail
", + expected: "> ```\n> code\n> ```\n>\n> > nested", + }, + { + name: "whitespace-separated paragraphs", + input: "codenested
\n", + expected: "> para1\n>\n> para2", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := HTMLToMarkdown(tt.input) + if result != tt.expected { + t.Errorf("HTMLToMarkdown(%q)\ngot: %q\nwant: %q", tt.input, result, tt.expected) + } + }) + } +} + +func TestHTMLToMarkdownMultilineParagraph(t *testing.T) { + input := "para1
\npara2
\n
line1\nline2
" + result := HTMLToMarkdown(input) + if !strings.Contains(result, "line1") || !strings.Contains(result, "line2") { + t.Errorf("HTMLToMarkdown(%q)\ngot: %q\nmissing content", input, result) + } +} + +func TestHTMLToMarkdownCodeFenceNewline(t *testing.T) { + input := "func main() {}\n"
+ result := HTMLToMarkdown(input)
+ if strings.Contains(result, "\n\n```") {
+ t.Errorf("HTMLToMarkdown(%q) has extra blank line before closing fence\ngot: %q", input, result)
+ }
+ if !strings.Contains(result, "func main() {}") {
+ t.Errorf("HTMLToMarkdown(%q) missing code content\ngot: %q", input, result)
+ }
+}
+
+func TestHTMLToMarkdownCodePreservesHTMLEntities(t *testing.T) {
+ tests := []struct {
+ name string
+ input string
+ contains string
+ }{
+ {
+ name: "p tags in code block survive reP and reStripTags",
+ input: "<p>\nhi\n</p>\n",
+ contains: "\nhi\n
", + }, + { + name: "div tags in code block survive reStripTags", + input: "<div>hello</div>",
+ contains: "", + contains: "<p>\nhi\n</p>\n
\n> hi\n>
", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := HTMLToMarkdown(tt.input) + if !strings.Contains(result, tt.contains) { + t.Errorf("HTMLToMarkdown(%q)\ngot: %q\nmissing: %q", tt.input, result, tt.contains) + } + }) + } +} + +func TestHTMLToMarkdownNestedLists(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + { + name: "nested ul compact", + input: "", + expected: "> > nested", + }, + { + name: "sibling lists preserved", + input: "nested
text