diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 523b2cb82..a3d6ac1cb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -20,6 +20,7 @@ add_library(cmark latex.c man.c node.c + front_matter.c references.c render.c scanners.c diff --git a/src/blocks.c b/src/blocks.c index 292e563bb..99ba9c9ec 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -16,6 +16,7 @@ #include "cmark.h" #include "node.h" #include "references.h" +#include "front_matter.h" #include "utf8.h" #include "scanners.h" #include "inlines.h" @@ -97,6 +98,8 @@ cmark_parser *cmark_parser_new_with_mem_into_root(int options, cmark_mem *mem, c cmark_strbuf_init(mem, &parser->curline, 256); cmark_strbuf_init(mem, &parser->linebuf, 0); cmark_strbuf_init(mem, &parser->content, 0); + cmark_strbuf_init(mem, &parser->front_matter_buf, 0); + cmark_strbuf_init(mem, &parser->front_matter_info, 0); root->flags = CMARK_NODE__OPEN; @@ -133,6 +136,8 @@ void cmark_parser_free(cmark_parser *parser) { cmark_mem *mem = parser->mem; cmark_strbuf_free(&parser->curline); cmark_strbuf_free(&parser->linebuf); + cmark_strbuf_free(&parser->front_matter_buf); + cmark_strbuf_free(&parser->front_matter_info); cmark_reference_map_free(parser->refmap); mem->free(parser); } @@ -1301,6 +1306,10 @@ static void S_process_line(cmark_parser *parser, const unsigned char *buffer, parser->line_number++; + if ((parser->options & CMARK_OPT_FRONT_MATTER) && + cmark_front_matter_process_line(parser, &input)) + goto finished; + last_matched_container = check_open_blocks(parser, &input, &all_matched); if (!last_matched_container) @@ -1334,6 +1343,14 @@ cmark_node *cmark_parser_finish(cmark_parser *parser) { cmark_consolidate_text_nodes(parser->root); + // If front matter scanning was still active when the document ended, no + // closing delimiter was found. The entire document (after the opening ---) + // is treated as front matter. 
+ if ((parser->options & CMARK_OPT_FRONT_MATTER) && parser->front_matter_scanning) + cmark_front_matter_process_line(parser, NULL); + + cmark_strbuf_free(&parser->front_matter_buf); + cmark_strbuf_free(&parser->front_matter_info); cmark_strbuf_free(&parser->curline); #if CMARK_DEBUG_NODES diff --git a/src/cmark.h b/src/cmark.h index 6626b0627..038714e91 100644 --- a/src/cmark.h +++ b/src/cmark.h @@ -46,9 +46,10 @@ typedef enum { CMARK_NODE_PARAGRAPH, CMARK_NODE_HEADING, CMARK_NODE_THEMATIC_BREAK, + CMARK_NODE_FRONT_MATTER, CMARK_NODE_FIRST_BLOCK = CMARK_NODE_DOCUMENT, - CMARK_NODE_LAST_BLOCK = CMARK_NODE_THEMATIC_BREAK, + CMARK_NODE_LAST_BLOCK = CMARK_NODE_FRONT_MATTER, /* Inline */ CMARK_NODE_TEXT, @@ -641,6 +642,13 @@ char *cmark_render_latex(cmark_node *root, int options, int width); */ #define CMARK_OPT_SMART (1 << 10) +/** Parse front matter ("---" delimited block at the start of the document) + * and expose it as a CMARK_NODE_FRONT_MATTER node. The raw content between + * the delimiters is available via cmark_node_get_literal(); how it is + * interpreted (e.g. as YAML, TOML, JSON) is left to the caller. 
+ */ +#define CMARK_OPT_FRONT_MATTER (1 << 11) + /** * ## Version information */ diff --git a/src/commonmark.c b/src/commonmark.c index ad805a630..88364c843 100644 --- a/src/commonmark.c +++ b/src/commonmark.c @@ -455,6 +455,19 @@ static int S_render_node(cmark_renderer *renderer, cmark_node *node, } break; + case CMARK_NODE_FRONT_MATTER: + if (entering) { + const char *info = cmark_node_get_fence_info(node); + BLANKLINE(); + LIT("---"); + if (info && *info) { LIT(" "); OUT(info, false, LITERAL); } + LIT("\n"); + OUT(cmark_node_get_literal(node), false, LITERAL); + LIT("---\n"); + BLANKLINE(); + } + break; + default: assert(false); break; diff --git a/src/front_matter.c b/src/front_matter.c new file mode 100644 index 000000000..9a7510125 --- /dev/null +++ b/src/front_matter.c @@ -0,0 +1,132 @@ +#include "front_matter.h" +#include "cmark.h" + +#include + +// --------------------------------------------------------------------------- +// Delimiter and info string parsing +// --------------------------------------------------------------------------- + +// Return true if `input` is an opening front matter delimiter: "---" followed +// by an optional info string and a newline. No leading whitespace before +// "---" is permitted. +// +// Note: some tools (e.g. Jekyll) also accept "..." as a closing delimiter, +// derived from the YAML document-end marker. We intentionally do not support +// it here because this implementation is format-agnostic — the content between +// the delimiters may be YAML, TOML, JSON, or anything else. "..." has no +// meaning outside of YAML, so "---" is the only unambiguous delimiter. +static bool is_opening_delimiter(cmark_chunk *input) { + const unsigned char *p = input->data; + return input->len >= 3 && p[0] == '-' && p[1] == '-' && p[2] == '-'; +} + +// Return true if `input` is a closing front matter delimiter: exactly "---" +// with optional trailing whitespace then a newline. An info string is not +// permitted on the closing delimiter. 
+static bool is_closing_delimiter(cmark_chunk *input) {
+  const unsigned char *p = input->data;
+  int len = input->len;
+  // The line must begin with three dashes at its very first byte.
+  if (len < 3 || !(p[0] == '-' && p[1] == '-' && p[2] == '-'))
+    return false;
+  // After the dashes only spaces/tabs may appear before the line ending.
+  for (int i = 3; i < len; i++) {
+    if (p[i] == '\n' || p[i] == '\r')
+      return true;
+    if (p[i] != ' ' && p[i] != '\t')
+      return false;
+  }
+  return true;
+}
+
+// Extract the optional info string from an opening delimiter line, e.g.
+// "--- yaml\n" yields "yaml". Zero-length if absent. Needs input->len >= 3.
+static cmark_chunk parse_info(cmark_chunk *input) {
+  const unsigned char *p = input->data + 3;
+  int len = input->len - 3;
+  // Trim leading spaces/tabs, then trailing spaces/tabs and line-ending bytes.
+  while (len > 0 && (*p == ' ' || *p == '\t')) { p++; len--; }
+  while (len > 0 && (p[len-1] == '\n' || p[len-1] == '\r' ||
+                     p[len-1] == ' ' || p[len-1] == '\t'))
+    len--;
+  // The returned chunk points into input->data; nothing is copied here.
+  return (cmark_chunk){ .data = (unsigned char *)p, .len = (bufsize_t)len };
+}
+
+// ---------------------------------------------------------------------------
+// Node creation
+// ---------------------------------------------------------------------------
+// Emit the front matter node as the root's first child and reset scan state.
+static void create_front_matter_node(cmark_parser *parser) {
+  cmark_node *node =
+      cmark_node_new_with_mem(CMARK_NODE_FRONT_MATTER, parser->mem);
+
+  // Stored like a code block (info string + literal); empty buffers become "".
+  cmark_node_set_fence_info(node,
+                            parser->front_matter_info.size > 0
+                                ? (const char *)parser->front_matter_info.ptr
+                                : "");
+
+  cmark_node_set_literal(node,
+                         parser->front_matter_buf.size > 0
+                             ? (const char *)parser->front_matter_buf.ptr
+                             : "");
+  // Source span is fixed: line 1, column 1 through column 3 of the current line.
+  node->start_line = 1;
+  node->start_column = 1;
+  node->end_line = parser->line_number;
+  node->end_column = 3;
+  // Place the node ahead of any children the root already has.
+  cmark_node *first = cmark_node_first_child(parser->root);
+  if (first)
+    cmark_node_insert_before(first, node);
+  else
+    cmark_node_append_child(parser->root, node);
+  // Scanning is over: drop the flag and clear both accumulation buffers.
+  parser->front_matter_scanning = false;
+  cmark_strbuf_clear(&parser->front_matter_buf);
+  cmark_strbuf_clear(&parser->front_matter_info);
+}
+
+// ---------------------------------------------------------------------------
+// State machine — called from S_process_line in blocks.c
+// ---------------------------------------------------------------------------
+// Returns true when the line was consumed here and must skip block parsing.
+bool cmark_front_matter_process_line(cmark_parser *parser, cmark_chunk *input) {
+  // NULL means end of document with no closing delimiter: emit what we have.
+  if (input == NULL) {
+    create_front_matter_node(parser);
+    return true;
+  }
+
+  // Adjust for any offset already consumed (e.g. a UTF-8 BOM on line 1).
+  cmark_chunk adjusted = {
+    .data = input->data + parser->offset,
+    .len = input->len - parser->offset,
+  };
+  input = &adjusted;
+  // Line 1 alone decides whether the document carries front matter.
+  if (parser->line_number == 1) {
+    if (is_opening_delimiter(input)) {
+      parser->front_matter_scanning = true;
+      // Capture optional info string (e.g. "yaml" from "--- yaml\n").
+      cmark_chunk info = parse_info(input);
+      if (info.len > 0)
+        cmark_strbuf_put(&parser->front_matter_info, info.data, info.len);
+    }
+    return parser->front_matter_scanning;
+  }
+  // Past line 1: lines are only consumed while a front matter block is open.
+  if (!parser->front_matter_scanning)
+    return false;
+
+  if (is_closing_delimiter(input)) {
+    create_front_matter_node(parser);
+    return true;
+  }
+
+  // Accumulate this content line in full (all input->len bytes).
+ cmark_strbuf_put(&parser->front_matter_buf, input->data, input->len); + return true; +} diff --git a/src/front_matter.h b/src/front_matter.h new file mode 100644 index 000000000..007dbbb4a --- /dev/null +++ b/src/front_matter.h @@ -0,0 +1,24 @@ +#ifndef CMARK_FRONT_MATTER_H +#define CMARK_FRONT_MATTER_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "cmark.h" +#include "parser.h" +#include "chunk.h" + +// Called from S_process_line in blocks.c for every line when +// CMARK_OPT_FRONT_MATTER is set. Drives the front matter state machine +// stored directly on the parser (front_matter_scanning / front_matter_buf). +// +// Returns true if the line was consumed by the front matter scanner and +// should not be passed to the normal block parser. +bool cmark_front_matter_process_line(cmark_parser *parser, cmark_chunk *input); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/html.c b/src/html.c index 5c14fa6f8..791172223 100644 --- a/src/html.c +++ b/src/html.c @@ -317,6 +317,9 @@ static int S_render_node(cmark_node *node, cmark_event_type ev_type, } break; + case CMARK_NODE_FRONT_MATTER: + break; + default: assert(false); break; diff --git a/src/latex.c b/src/latex.c index 386c14ff5..f254937ca 100644 --- a/src/latex.c +++ b/src/latex.c @@ -443,6 +443,9 @@ static int S_render_node(cmark_renderer *renderer, cmark_node *node, } break; + case CMARK_NODE_FRONT_MATTER: + break; + default: assert(false); break; diff --git a/src/main.c b/src/main.c index 9a43cceda..d03908b66 100644 --- a/src/main.c +++ b/src/main.c @@ -40,6 +40,7 @@ void print_usage(void) { printf(" --safe Omit raw HTML and dangerous URLs\n"); printf(" --unsafe Render raw HTML and dangerous URLs\n"); printf(" --smart Use smart punctuation\n"); + printf(" --front-matter Parse front matter (--- delimited block at start of document)\n"); printf(" --validate-utf8 Replace invalid UTF-8 sequences with U+FFFD\n"); printf(" --help, -h Print usage information\n"); printf(" --version Print 
version\n"); @@ -112,6 +113,8 @@ int main(int argc, char *argv[]) { options |= CMARK_OPT_NOBREAKS; } else if (strcmp(argv[i], "--smart") == 0) { options |= CMARK_OPT_SMART; + } else if (strcmp(argv[i], "--front-matter") == 0) { + options |= CMARK_OPT_FRONT_MATTER; } else if (strcmp(argv[i], "--safe") == 0) { options |= CMARK_OPT_SAFE; } else if (strcmp(argv[i], "--unsafe") == 0) { diff --git a/src/man.c b/src/man.c index 02dfb7cb3..bc580d196 100644 --- a/src/man.c +++ b/src/man.c @@ -268,6 +268,9 @@ static int S_render_node(cmark_renderer *renderer, cmark_node *node, } break; + case CMARK_NODE_FRONT_MATTER: + break; + default: assert(false); break; diff --git a/src/node.c b/src/node.c index c1492545e..565a76b7e 100644 --- a/src/node.c +++ b/src/node.c @@ -124,6 +124,7 @@ static void S_free_nodes(cmark_node *e) { while (e != NULL) { switch (e->type) { case CMARK_NODE_CODE_BLOCK: + case CMARK_NODE_FRONT_MATTER: mem->free(e->data); mem->free(e->as.code.info); break; @@ -199,6 +200,8 @@ const char *cmark_node_get_type_string(cmark_node *node) { return "heading"; case CMARK_NODE_THEMATIC_BREAK: return "thematic_break"; + case CMARK_NODE_FRONT_MATTER: + return "front_matter"; case CMARK_NODE_TEXT: return "text"; case CMARK_NODE_SOFTBREAK: @@ -311,6 +314,7 @@ const char *cmark_node_get_literal(cmark_node *node) { case CMARK_NODE_HTML_INLINE: case CMARK_NODE_CODE: case CMARK_NODE_CODE_BLOCK: + case CMARK_NODE_FRONT_MATTER: return node->data ? 
(char *)node->data : ""; default: @@ -331,6 +335,7 @@ int cmark_node_set_literal(cmark_node *node, const char *content) { case CMARK_NODE_HTML_INLINE: case CMARK_NODE_CODE: case CMARK_NODE_CODE_BLOCK: + case CMARK_NODE_FRONT_MATTER: node->len = cmark_set_cstr(node->mem, &node->data, content); return 1; @@ -487,7 +492,8 @@ const char *cmark_node_get_fence_info(cmark_node *node) { return NULL; } - if (node->type == CMARK_NODE_CODE_BLOCK) { + if (node->type == CMARK_NODE_CODE_BLOCK || + node->type == CMARK_NODE_FRONT_MATTER) { return node->as.code.info ? (char *)node->as.code.info : ""; } else { return NULL; @@ -499,7 +505,8 @@ int cmark_node_set_fence_info(cmark_node *node, const char *info) { return 0; } - if (node->type == CMARK_NODE_CODE_BLOCK) { + if (node->type == CMARK_NODE_CODE_BLOCK || + node->type == CMARK_NODE_FRONT_MATTER) { cmark_set_cstr(node->mem, &node->as.code.info, info); return 1; } else { diff --git a/src/parser.h b/src/parser.h index f546ace11..97b8aa3fa 100644 --- a/src/parser.h +++ b/src/parser.h @@ -33,6 +33,24 @@ struct cmark_parser { int options; bool last_buffer_ended_with_cr; unsigned int total_size; + + /* Front matter scanning state (CMARK_OPT_FRONT_MATTER). + * + * cmark_front_matter_process_line() is called from S_process_line() in + * blocks.c immediately after parser->line_number is incremented, so the + * first line of the document arrives with line_number == 1. The function + * relies on this: it uses line_number == 1 as the trigger to decide + * whether the document opens with a front matter block. + * + * front_matter_scanning is set to true when a valid opening "---" is seen + * on line 1 and remains true until the matching closing "---" is found or + * the document ends. While scanning, each content line is accumulated in + * front_matter_buf. Both fields are freed explicitly in + * cmark_parser_finish() and cmark_parser_free(). 
+ */ + bool front_matter_scanning; + cmark_strbuf front_matter_buf; /* accumulated content lines */ + cmark_strbuf front_matter_info; /* optional format hint from opening "--- " */ }; #ifdef __cplusplus diff --git a/src/xml.c b/src/xml.c index 2ca2de82c..eb823a12a 100644 --- a/src/xml.c +++ b/src/xml.c @@ -115,6 +115,18 @@ static int S_render_node(cmark_node *node, cmark_event_type ev_type, case CMARK_NODE_DOCUMENT: cmark_strbuf_puts(xml, " xmlns=\"http://commonmark.org/xml/1.0\""); break; + case CMARK_NODE_FRONT_MATTER: + if (node->as.code.info) { + cmark_strbuf_puts(xml, " info=\""); + escape_xml_str(xml, node->as.code.info); + cmark_strbuf_putc(xml, '"'); + } + cmark_strbuf_puts(xml, " xml:space=\"preserve\">"); + escape_xml(xml, node->data, node->len); + cmark_strbuf_puts(xml, "") + add_test(NAME front_matter_executable + COMMAND "$" "${CMAKE_CURRENT_SOURCE_DIR}/spec_tests.py" + --no-normalize + --spec "${CMAKE_CURRENT_SOURCE_DIR}/front_matter.txt" + --program "$ --front-matter") + ELSE(Python3_Interpreter_FOUND) message(WARNING "A Python 3 Interpreter is required to run the spec tests") diff --git a/test/front_matter.txt b/test/front_matter.txt new file mode 100644 index 000000000..b40c5ec56 --- /dev/null +++ b/test/front_matter.txt @@ -0,0 +1,144 @@ +# Front Matter + +Front matter is an optional metadata block at the very start of a document, +delimited by `---` on its own line. It is enabled with the `--front-matter` +flag (or `CMARK_OPT_FRONT_MATTER` in the C API). The raw content between the +delimiters is exposed as a `CMARK_NODE_FRONT_MATTER` node; how it is +interpreted is left to the caller. + +All examples in this file are run with `--front-matter` enabled. + +## Basic front matter + +A document that opens with `---` followed by a closing `---` produces a front +matter node. The front matter does not appear in the HTML output. + +```````````````````````````````` example +--- +title: Hello +--- +# Body +. +
<h1>Body</h1>
+```````````````````````````````` + +Front matter with no body produces no HTML output. + +```````````````````````````````` example +--- +title: Hello +--- +. +```````````````````````````````` + +## Info string + +An optional info string after the opening `---` describes the format of the +content (e.g. `yaml`, `toml`, `json`). It does not affect the HTML output. + +```````````````````````````````` example +--- yaml +title: Hello +--- +# Body +. +
<h1>Body</h1>
+```````````````````````````````` + +Info strings with no space between `---` and the format name are also accepted. + +```````````````````````````````` example +---yaml +title: Hello +--- +# Body +. +
<h1>Body</h1>
+```````````````````````````````` + +## Empty front matter + +An empty front matter block (opening and closing `---` on consecutive lines) +is valid and produces no HTML output. + +```````````````````````````````` example +--- +--- +# Body +. +
<h1>Body</h1>
+```````````````````````````````` + +## No closing delimiter + +If no closing `---` is found, the entire document (after the opening `---`) is +treated as front matter. Nothing is rendered to HTML. + +```````````````````````````````` example +--- +title: Hello +# Not a heading +. +```````````````````````````````` + +## Not front matter + +Front matter is only recognised when `---` is the very first line. A `---` +elsewhere in the document is a thematic break. + +```````````````````````````````` example +# Heading + +--- +title: Not front matter +--- +. +
<h1>Heading</h1>
+<hr />
+<h2>title: Not front matter</h2>
+```````````````````````````````` + +## Trailing whitespace on delimiter + +The closing delimiter may have trailing whitespace. + +```````````````````````````````` example +--- +title: Hello +--- +# Body +. +
<h1>Body</h1>
+```````````````````````````````` + +## Front matter content is not parsed as Markdown + +Block-level Markdown syntax inside the front matter block is not interpreted; +it is captured as raw text. + +```````````````````````````````` example +--- +# not a heading +**not bold** +- not a list +--- +# Body +. +
<h1>Body</h1>
+```````````````````````````````` + +## `...` is not a closing delimiter + +Unlike some tools (e.g. Jekyll), `...` is not treated as a closing delimiter. +This implementation is format-agnostic; `...` is a YAML-specific convention +with no meaning for other formats. + +```````````````````````````````` example +--- +title: Hello +... +--- +# Body +. +
<h1>Body</h1>
+````````````````````````````````