diff --git a/lib/handle/emphasis.js b/lib/handle/emphasis.js index d701e9b..92be547 100644 --- a/lib/handle/emphasis.js +++ b/lib/handle/emphasis.js @@ -4,13 +4,11 @@ */ import {checkEmphasis} from '../util/check-emphasis.js' +import {encodeCharacterReference} from '../util/encode-character-reference.js' +import {encodeInfo} from '../util/encode-info.js' emphasis.peek = emphasisPeek -// To do: there are cases where emphasis cannot “form” depending on the -// previous or next character of sequences. -// There’s no way around that though, except for injecting zero-width stuff. -// Do we need to safeguard against that? /** * @param {Emphasis} node * @param {Parents | undefined} _ @@ -22,17 +20,42 @@ export function emphasis(node, _, state, info) { const marker = checkEmphasis(state) const exit = state.enter('emphasis') const tracker = state.createTracker(info) - let value = tracker.move(marker) - value += tracker.move( + const before = tracker.move(marker) + + let between = tracker.move( state.containerPhrasing(node, { - before: value, after: marker, + before, ...tracker.current() }) ) - value += tracker.move(marker) + const betweenHead = between.charCodeAt(0) + const open = encodeInfo( + info.before.charCodeAt(info.before.length - 1), + betweenHead, + marker + ) + + if (open.inside) { + between = encodeCharacterReference(betweenHead) + between.slice(1) + } + + const betweenTail = between.charCodeAt(between.length - 1) + const close = encodeInfo(info.after.charCodeAt(0), betweenTail, marker) + + if (close.inside) { + between = between.slice(0, -1) + encodeCharacterReference(betweenTail) + } + + const after = tracker.move(marker) + exit() - return value + + state.attentionEncodeSurroundingInfo = { + after: close.outside, + before: open.outside + } + return before + between + after } /** diff --git a/lib/handle/heading.js b/lib/handle/heading.js index c5afa73..e421379 100644 --- a/lib/handle/heading.js +++ b/lib/handle/heading.js @@ -3,6 +3,7 @@ * @import {Heading, Parents} from 'mdast' */ +import {encodeCharacterReference} from '../util/encode-character-reference.js' import {formatHeadingAsSetext} from '../util/format-heading-as-setext.js' /** @@ -58,11 +59,7 @@ export function heading(node, _, state, info) { if (/^[\t ]/.test(value)) { // To do: what effect has the character reference on tracking? - value = - '&#x' + - value.charCodeAt(0).toString(16).toUpperCase() + - ';' + - value.slice(1) + value = encodeCharacterReference(value.charCodeAt(0)) + value.slice(1) } value = value ? sequence + ' ' + value : sequence diff --git a/lib/handle/strong.js b/lib/handle/strong.js index 5cb3148..475d5f7 100644 --- a/lib/handle/strong.js +++ b/lib/handle/strong.js @@ -4,13 +4,11 @@ */ import {checkStrong} from '../util/check-strong.js' +import {encodeCharacterReference} from '../util/encode-character-reference.js' +import {encodeInfo} from '../util/encode-info.js' strong.peek = strongPeek -// To do: there are cases where emphasis cannot “form” depending on the -// previous or next character of sequences. -// There’s no way around that though, except for injecting zero-width stuff. -// Do we need to safeguard against that? /** * @param {Strong} node * @param {Parents | undefined} _ @@ -22,17 +20,42 @@ export function strong(node, _, state, info) { const marker = checkStrong(state) const exit = state.enter('strong') const tracker = state.createTracker(info) - let value = tracker.move(marker + marker) - value += tracker.move( + const before = tracker.move(marker + marker) + + let between = tracker.move( state.containerPhrasing(node, { - before: value, after: marker, + before, ...tracker.current() }) ) - value += tracker.move(marker + marker) + const betweenHead = between.charCodeAt(0) + const open = encodeInfo( + info.before.charCodeAt(info.before.length - 1), + betweenHead, + marker + ) + + if (open.inside) { + between = encodeCharacterReference(betweenHead) + between.slice(1) + } + + const betweenTail = between.charCodeAt(between.length - 1) + const close = encodeInfo(info.after.charCodeAt(0), betweenTail, marker) + + if (close.inside) { + between = between.slice(0, -1) + encodeCharacterReference(betweenTail) + } + + const after = tracker.move(marker + marker) + exit() - return value + + state.attentionEncodeSurroundingInfo = { + after: close.outside, + before: open.outside + } + return before + between + after } /** diff --git a/lib/index.js b/lib/index.js index 24e696e..3154b31 100644 --- a/lib/index.js +++ b/lib/index.js @@ -22,35 +22,36 @@ import {track} from './util/track.js' * * @param {Nodes} tree * Tree to serialize. - * @param {Options} [options] + * @param {Options | null | undefined} [options] * Configuration (optional). * @returns {string} * Serialized markdown representing `tree`. */ -export function toMarkdown(tree, options = {}) { +export function toMarkdown(tree, options) { + const settings = options || {} /** @type {State} */ const state = { - enter, - indentLines, associationId: association, containerPhrasing: containerPhrasingBound, containerFlow: containerFlowBound, createTracker: track, compilePattern, - safe: safeBound, - stack: [], - unsafe: [...unsafe], - join: [...join], + enter, // @ts-expect-error: GFM / frontmatter are typed in `mdast` but not defined // here. handlers: {...handlers}, - options: {}, - indexStack: [], // @ts-expect-error: add `handle` in a second. - handle: undefined + handle: undefined, + indentLines, + indexStack: [], + join: [...join], + options: {}, + safe: safeBound, + stack: [], + unsafe: [...unsafe] } - configure(state, options) + configure(state, settings) if (state.options.tightDefinitions) { state.join.push(joinDefinition) diff --git a/lib/types.d.ts b/lib/types.d.ts index b7f5ab9..4cf0157 100644 --- a/lib/types.d.ts +++ b/lib/types.d.ts @@ -392,6 +392,38 @@ export type ContainerPhrasing = (parent: PhrasingParents, info: Info) => string */ export type CreateTracker = (info: TrackFields) => Tracker +/** + * Whether to encode things — with fields representing the surrounding of a + * whole. + */ +export interface EncodeSurrounding { + /** + * Whether to encode after. + */ + after: boolean + + /** + * Whether to encode before. + */ + before: boolean +} + +/** + * Whether to encode things — with fields representing the relationship to a + * whole. + */ +export interface EncodeSides { + /** + * Whether to encode inside. + */ + inside: boolean + + /** + * Whether to encode before. + */ + outside: boolean +} + /** * Enter something. * @@ -754,6 +786,18 @@ export interface State { * Get an identifier from an association to match it to others. */ associationId: AssociationId + /** + * Info on whether to encode the surrounding of *attention*. + * + * Whether attention (emphasis, strong, strikethrough) forms + * depends on the characters inside and outside them. + * The characters inside can be handled by *attention* itself. + * However the outside characters are already handled. + * Or handled afterwards. + * This field can be used to signal from *attention* that some parent + * function (practically `containerPhrasing`) has to handle the surrounding. + */ + attentionEncodeSurroundingInfo: EncodeSurrounding | undefined /** * List marker currently in use. */ diff --git a/lib/util/container-phrasing.js b/lib/util/container-phrasing.js index bf2ddb3..999c717 100644 --- a/lib/util/container-phrasing.js +++ b/lib/util/container-phrasing.js @@ -3,6 +3,8 @@ * @import {PhrasingParents} from '../types.js' */ +import {encodeCharacterReference} from './encode-character-reference.js' + /** * Serialize the children of a parent that contains phrasing children. * @@ -24,6 +26,8 @@ export function containerPhrasing(parent, state, info) { const results = [] let index = -1 let before = info.before + /** @type {string | undefined} */ + let encodeAfter indexStack.push(-1) let tracker = state.createTracker(info) @@ -75,17 +79,43 @@ export function containerPhrasing(parent, state, info) { tracker.move(results.join('')) } - results.push( - tracker.move( - state.handle(child, parent, state, { - ...tracker.current(), - before, - after - }) - ) - ) + let value = state.handle(child, parent, state, { + ...tracker.current(), + after, + before + }) + + // If we had to encode the first character after the previous node and it’s + // still the same character, + // encode it. + if (encodeAfter && encodeAfter === value.slice(0, 1)) { + value = + encodeCharacterReference(encodeAfter.charCodeAt(0)) + value.slice(1) + } + + const encodingInfo = state.attentionEncodeSurroundingInfo + state.attentionEncodeSurroundingInfo = undefined + encodeAfter = undefined + + // If we have to encode the first character before the current node and + // it’s still the same character, + // encode it. + if (encodingInfo) { + if ( + encodingInfo.before && + before === results[results.length - 1].slice(-1) + ) { + results[results.length - 1] = + results[results.length - 1].slice(0, -1) + + encodeCharacterReference(before.charCodeAt(0)) + } + + if (encodingInfo.after) encodeAfter = after + } - before = results[results.length - 1].slice(-1) + tracker.move(value) + results.push(value) + before = value.slice(-1) } indexStack.pop() diff --git a/lib/util/encode-character-reference.js b/lib/util/encode-character-reference.js new file mode 100644 index 0000000..149d26a --- /dev/null +++ b/lib/util/encode-character-reference.js @@ -0,0 +1,11 @@ +/** + * Encode a code point as a character reference. + * + * @param {number} code + * Code point to encode. + * @returns {string} + * Encoded character reference. + */ +export function encodeCharacterReference(code) { + return '&#x' + code.toString(16).toUpperCase() + ';' +} diff --git a/lib/util/encode-info.js b/lib/util/encode-info.js new file mode 100644 index 0000000..8895a79 --- /dev/null +++ b/lib/util/encode-info.js @@ -0,0 +1,82 @@ +/** + * @import {EncodeSides} from '../types.js' + */ + +import {classifyCharacter} from 'micromark-util-classify-character' + +/** + * Check whether to encode (as a character reference) the characters + * surrounding an attention run. + * + * Which characters are around an attention run influence whether it works or + * not. + * + * See for more info. + * See this markdown in a particular renderer to see what works: + * + * ```markdown + * | | A (letter inside) | B (punctuation inside) | C (whitespace inside) | D (nothing inside) | + * | ----------------------- | ----------------- | ---------------------- | --------------------- | ------------------ | + * | 1 (letter outside) | x*y*z | x*.*z | x* *z | x**z | + * | 2 (punctuation outside) | .*y*. | .*.*. | .* *. | .**. | + * | 3 (whitespace outside) | x *y* z | x *.* z | x * * z | x ** z | + * | 4 (nothing outside) | *x* | *.* | * * | ** | + * ``` + * + * @param {number} outside + * Code point on the outer side of the run. + * @param {number} inside + * Code point on the inner side of the run. + * @param {'*' | '_'} marker + * Marker of the run. + * Underscores are handled more strictly (they form less often) than + * asterisks. + * @returns {EncodeSides} + * Whether to encode characters. + */ +// Important: punctuation must never be encoded. +// Punctuation is solely used by markdown constructs. +// And by encoding itself. +// Encoding them will break constructs or double encode things. +export function encodeInfo(outside, inside, marker) { + const outsideKind = classifyCharacter(outside) + const insideKind = classifyCharacter(inside) + + // Letter outside: + if (outsideKind === undefined) { + return insideKind === undefined + ? // Letter inside: + // we have to encode *both* letters for `_` as it is looser. + // it already forms for `*` (and GFMs `~`). + marker === '_' + ? {inside: true, outside: true} + : {inside: false, outside: false} + : insideKind === 1 + ? // Whitespace inside: encode both (letter, whitespace). + {inside: true, outside: true} + : // Punctuation inside: encode outer (letter) + {inside: false, outside: true} + } + + // Whitespace outside: + if (outsideKind === 1) { + return insideKind === undefined + ? // Letter inside: already forms. + {inside: false, outside: false} + : insideKind === 1 + ? // Whitespace inside: encode both (whitespace). + {inside: true, outside: true} + : // Punctuation inside: already forms. + {inside: false, outside: false} + } + + // Punctuation outside: + return insideKind === undefined + ? // Letter inside: already forms. + {inside: false, outside: false} + : insideKind === 1 + ? // Whitespace inside: encode inner (whitespace). + {inside: true, outside: false} + : // Punctuation inside: already forms. + {inside: false, outside: false} +} diff --git a/lib/util/safe.js b/lib/util/safe.js index 2a3a268..456fe21 100644 --- a/lib/util/safe.js +++ b/lib/util/safe.js @@ -2,6 +2,7 @@ * @import {SafeConfig, State} from 'mdast-util-to-markdown' */ +import {encodeCharacterReference} from './encode-character-reference.js' import {patternInScope} from './pattern-in-scope.js' /** @@ -118,9 +119,7 @@ export function safe(state, input, config) { result.push('\\') } else { // Character reference. - result.push( - '&#x' + value.charCodeAt(position).toString(16).toUpperCase() + ';' - ) + result.push(encodeCharacterReference(value.charCodeAt(position))) start++ } } diff --git a/package.json b/package.json index 5a34db9..e47b765 100644 --- a/package.json +++ b/package.json @@ -10,6 +10,7 @@ "longest-streak": "^3.0.0", "mdast-util-phrasing": "^4.0.0", "mdast-util-to-string": "^4.0.0", + "micromark-util-classify-character": "^2.0.0", "micromark-util-decode-string": "^2.0.0", "unist-util-visit": "^5.0.0", "zwitch": "^2.0.0" @@ -110,6 +111,15 @@ "interface" ] } + }, + { + "files": [ + "test/**/*.js" + ], + "rules": { + "max-depth": "off", + "no-await-in-loop": "off" + } } ], "prettier": true, diff --git a/test/index.js b/test/index.js index 1564751..e47facd 100644 --- a/test/index.js +++ b/test/index.js @@ -1,6 +1,6 @@ /** * @import {Handle} from 'mdast-util-to-markdown' - * @import {BlockContent, List} from 'mdast' + * @import {BlockContent, List, PhrasingContent, Root} from 'mdast' */ import assert from 'node:assert/strict' @@ -4818,6 +4818,96 @@ a _\\__ is this emphasis? _\\__` ) }) +test('roundtrip attention', async function (t) { + /** + * @typedef Case + * @property {string} inside + * @property {(typeof markers)[number]} marker + * @property {string} outside + * @property {(typeof sides)[number]} side + * @property {(typeof types)[number]} type + */ + + const characters = ['.', ' ', 'a'] + const markers = /** @type {const} */ (['*', '_']) + const sides = /** @type {const} */ (['open', 'close']) + const types = /** @type {const} */ (['emphasis', 'strong']) + /** @type {Array} */ + const tests = [] + + for (const type of types) { + for (const marker of markers) { + for (const side of sides) { + for (const inside of characters) { + for (const outside of characters) { + tests.push({inside, marker, outside, side, type}) + } + } + } + } + } + + for (const test of tests) { + const {inside, marker, outside, side, type} = test + const name = + 'should roundtrip `' + + type + + '` using `' + + marker + + '` in an ' + + side + + ' run: ' + + (outside === '.' + ? 'punctuation' + : outside === ' ' + ? 'whitespace' + : 'letter') + + ' outside and ' + + (inside === '.' + ? 'punctuation' + : inside === ' ' + ? 'whitespace' + : 'letter') + + ' inside' + + await t.test(name, async function () { + /** @type {Array} */ + const children = [] + + if (side === 'open') { + children.push({type: 'text', value: 'x' + outside}) + } + + children.push({ + type, + children: [ + { + type: 'text', + value: + (side === 'open' ? inside : '') + + 'y' + + (side === 'close' ? inside : '') + } + ] + }) + + if (side === 'close') { + children.push({type: 'text', value: outside + 'z'}) + } + + /** @type {Root} */ + const expected = { + type: 'root', + children: [{type: 'paragraph', children}] + } + const markdown = to(expected, {emphasis: marker, strong: marker}) + const actual = from(markdown) + removePosition(actual, {force: true}) + assert.deepEqual(actual, expected) + }) + } +}) + test('position (output)', async function (t) { await t.test('should track output positions (1)', async function () { assert.equal(