I'm going to bed -=-
This commit is contained in:
2026-02-12 02:28:23 +02:00
parent 0b92f6f239
commit a5d75e6bac
1972 changed files with 308880 additions and 0 deletions

View File

@@ -0,0 +1,19 @@
# Public names re-exported by this package: the core state object plus the
# core rule functions imported below.
__all__ = (
    "StateCore",
    "block",
    "inline",
    "linkify",
    "normalize",
    "replace",
    "smartquotes",
    "text_join",
)
from .block import block
from .inline import inline
from .linkify import linkify
from .normalize import normalize
from .replacements import replace
from .smartquotes import smartquotes
from .state_core import StateCore
from .text_join import text_join

View File

@@ -0,0 +1,13 @@
from ..token import Token
from .state_core import StateCore
def block(state: StateCore) -> None:
    """Core rule: tokenize the source at block level.

    In inline mode the whole source is wrapped in a single ``inline`` token;
    otherwise the full block parser is run over it.
    """
    if not state.inlineMode:
        state.md.block.parse(state.src, state.md, state.env, state.tokens)
        return
    # Inline mode: emit one synthetic inline token covering the whole input.
    token = Token("inline", "", 0)
    token.content = state.src
    token.map = [0, 1]
    token.children = []
    state.tokens.append(token)

View File

@@ -0,0 +1,10 @@
from .state_core import StateCore
def inline(state: StateCore) -> None:
    """Core rule: run the inline parser over every ``inline`` token."""
    for tok in state.tokens:
        if tok.type != "inline":
            continue
        # Ensure a list exists for the inline parser to append children into.
        if tok.children is None:
            tok.children = []
        state.md.inline.parse(tok.content, state.md, state.env, tok.children)

View File

@@ -0,0 +1,149 @@
from __future__ import annotations
import re
from typing import Protocol
from ..common.utils import arrayReplaceAt, isLinkClose, isLinkOpen
from ..token import Token
from .state_core import StateCore
# Used to strip the temporary "http://" prefix added for bare-domain matches.
HTTP_RE = re.compile(r"^http://")
# Used to strip the "mailto:" prefix from displayed link text.
MAILTO_RE = re.compile(r"^mailto:")
TEST_MAILTO_RE = re.compile(r"^mailto:", flags=re.IGNORECASE)
def linkify(state: StateCore) -> None:
    """Rule for identifying plain-text links.

    Scans every ``inline`` token's children for text that the linkify
    engine recognises as a URL/e-mail, and replaces the matching text
    tokens with ``link_open``/``text``/``link_close`` triples.
    No-op unless the ``linkify`` option is enabled.
    """
    if not state.md.options.linkify:
        return
    if not state.md.linkify:
        raise ModuleNotFoundError("Linkify enabled but not installed.")
    for inline_token in state.tokens:
        # Cheap pretest before walking the children token list.
        if inline_token.type != "inline" or not state.md.linkify.pretest(
            inline_token.content
        ):
            continue
        tokens = inline_token.children
        htmlLinkLevel = 0
        # We scan from the end, to keep position when new tags added.
        # Use reversed logic in links start/end match
        assert tokens is not None
        i = len(tokens)
        while i >= 1:
            i -= 1
            assert isinstance(tokens, list)
            currentToken = tokens[i]
            # Skip content of markdown links
            if currentToken.type == "link_close":
                i -= 1
                # Walk back to the matching link_open at the same level.
                while (
                    tokens[i].level != currentToken.level
                    and tokens[i].type != "link_open"
                ):
                    i -= 1
                continue
            # Skip content of html tag links
            # (reverse scan: a closing </a> raises the depth, an opening <a> lowers it)
            if currentToken.type == "html_inline":
                if isLinkOpen(currentToken.content) and htmlLinkLevel > 0:
                    htmlLinkLevel -= 1
                if isLinkClose(currentToken.content):
                    htmlLinkLevel += 1
            if htmlLinkLevel > 0:
                continue
            if currentToken.type == "text" and state.md.linkify.test(
                currentToken.content
            ):
                text = currentToken.content
                links: list[_LinkType] = state.md.linkify.match(text) or []
                # Now split string to nodes
                nodes = []
                level = currentToken.level
                lastPos = 0
                # forbid escape sequence at the start of the string,
                # this avoids http\://example.com/ from being linkified as
                # http:<a href="//example.com/">//example.com/</a>
                if (
                    links
                    and links[0].index == 0
                    and i > 0
                    and tokens[i - 1].type == "text_special"
                ):
                    links = links[1:]
                for link in links:
                    url = link.url
                    fullUrl = state.md.normalizeLink(url)
                    if not state.md.validateLink(fullUrl):
                        continue
                    urlText = link.text
                    # Linkifier might send raw hostnames like "example.com", where url
                    # starts with domain name. So we prepend http:// in those cases,
                    # and remove it afterwards.
                    if not link.schema:
                        urlText = HTTP_RE.sub(
                            "", state.md.normalizeLinkText("http://" + urlText)
                        )
                    elif link.schema == "mailto:" and TEST_MAILTO_RE.search(urlText):
                        urlText = MAILTO_RE.sub(
                            "", state.md.normalizeLinkText("mailto:" + urlText)
                        )
                    else:
                        urlText = state.md.normalizeLinkText(urlText)
                    pos = link.index
                    # Plain text between the previous match and this one.
                    if pos > lastPos:
                        token = Token("text", "", 0)
                        token.content = text[lastPos:pos]
                        token.level = level
                        nodes.append(token)
                    token = Token("link_open", "a", 1)
                    token.attrs = {"href": fullUrl}
                    token.level = level
                    level += 1
                    token.markup = "linkify"
                    token.info = "auto"
                    nodes.append(token)
                    token = Token("text", "", 0)
                    token.content = urlText
                    token.level = level
                    nodes.append(token)
                    token = Token("link_close", "a", -1)
                    level -= 1
                    token.level = level
                    token.markup = "linkify"
                    token.info = "auto"
                    nodes.append(token)
                    lastPos = link.last_index
                # Trailing text after the last match.
                if lastPos < len(text):
                    token = Token("text", "", 0)
                    token.content = text[lastPos:]
                    token.level = level
                    nodes.append(token)
                # Splice the new nodes in place of the matched text token.
                inline_token.children = tokens = arrayReplaceAt(tokens, i, nodes)
class _LinkType(Protocol):
    """Structural type of one match object returned by ``linkify.match()``."""

    # Normalized URL of the match.
    url: str
    # Matched text as it appeared in the source.
    text: str
    # Start offset of the match within the scanned string.
    index: int
    # End offset (exclusive) of the match within the scanned string.
    last_index: int
    # URL schema such as "mailto:"; falsy for bare domains
    # (see the `if not link.schema` branch in `linkify`).
    schema: str | None

View File

@@ -0,0 +1,19 @@
"""Normalize input string."""
import re
from .state_core import StateCore
# https://spec.commonmark.org/0.29/#line-ending
NEWLINES_RE = re.compile(r"\r\n?|\n")
NULL_RE = re.compile(r"\0")
def normalize(state: StateCore) -> None:
    """Canonicalise the raw source before any parsing happens.

    Windows/Mac line endings are unified to "\\n" and NULL characters are
    replaced with the Unicode replacement character (U+FFFD).
    """
    unified = NEWLINES_RE.sub("\n", state.src)
    state.src = NULL_RE.sub("\ufffd", unified)

View File

@@ -0,0 +1,127 @@
"""Simple typographic replacements
* ``(c)``, ``(C)`` → ©
* ``(tm)``, ``(TM)`` → ™
* ``(r)``, ``(R)`` → ®
* ``+-`` → ±
* ``...`` → …
* ``?....`` → ?..
* ``!....`` → !..
* ``????????`` → ???
* ``!!!!!`` → !!!
* ``,,,`` → ,
* ``--`` → &ndash
* ``---`` → &mdash
"""
from __future__ import annotations
import logging
import re
from ..token import Token
from .state_core import StateCore
LOGGER = logging.getLogger(__name__)
# TODO:
# - fractionals 1/2, 1/4, 3/4 -> ½, ¼, ¾
# - multiplication 2 x 4 -> 2 × 4
# Quick pretest: matches text that *might* need a rare replacement, so the
# individual substitutions below only run when there is a candidate.
RARE_RE = re.compile(r"\+-|\.\.|\?\?\?\?|!!!!|,,|--")
# Workaround for phantomjs - need regex without /g flag,
# or root check will fail every second time
# SCOPED_ABBR_TEST_RE = r"\((c|tm|r)\)"
SCOPED_ABBR_RE = re.compile(r"\((c|tm|r)\)", flags=re.IGNORECASE)
PLUS_MINUS_RE = re.compile(r"\+-")
ELLIPSIS_RE = re.compile(r"\.{2,}")
ELLIPSIS_QUESTION_EXCLAMATION_RE = re.compile(r"([?!])…")
QUESTION_EXCLAMATION_RE = re.compile(r"([?!]){4,}")
COMMA_RE = re.compile(r",{2,}")
EM_DASH_RE = re.compile(r"(^|[^-])---(?=[^-]|$)", flags=re.MULTILINE)
EN_DASH_RE = re.compile(r"(^|\s)--(?=\s|$)", flags=re.MULTILINE)
EN_DASH_INDENT_RE = re.compile(r"(^|[^-\s])--(?=[^-\s]|$)", flags=re.MULTILINE)
# Replacement table for scoped abbreviations: (c) -> ©, (r) -> ®, (tm) -> ™.
# BUG FIX: "tm" previously mapped to the empty string, which silently
# *deleted* "(tm)" from the output instead of replacing it with ™
# (the module docstring documents (tm)/(TM) -> ™).
SCOPED_ABBR = {"c": "©", "r": "®", "tm": "™"}
def replaceFn(match: re.Match[str]) -> str:
    """Return the typographic symbol for a matched ``(c)``/``(r)``/``(tm)``."""
    return SCOPED_ABBR[match.group(1).lower()]
def replace_scoped(inlineTokens: list[Token]) -> None:
    """Substitute ``(c)``/``(r)``/``(tm)`` marks in text tokens.

    Text between an auto-link's open/close tokens is left untouched: the
    depth counter becomes non-zero (hence truthy) inside an auto-link.
    """
    autolink_depth = 0
    for tok in inlineTokens:
        if tok.type == "text" and not autolink_depth:
            tok.content = SCOPED_ABBR_RE.sub(replaceFn, tok.content)
        if tok.type == "link_open" and tok.info == "auto":
            autolink_depth -= 1
        if tok.type == "link_close" and tok.info == "auto":
            autolink_depth += 1
def replace_rare(inlineTokens: list[Token]) -> None:
    """Apply the "rare" typographic substitutions (±, ellipsis, comma runs,
    em/en dashes) to text tokens, skipping text inside auto-links.

    NOTE: ``inside_autolink`` goes to -1 on an auto ``link_open`` and back to
    0 on the matching ``link_close``; any non-zero value is truthy, so text
    between the two is skipped.  This mirrors the upstream implementation.
    """
    inside_autolink = 0
    for token in inlineTokens:
        if (
            token.type == "text"
            and (not inside_autolink)
            and RARE_RE.search(token.content)
        ):
            # +- -> ±
            token.content = PLUS_MINUS_RE.sub("±", token.content)
            # .., ..., ....... -> …
            # BUG FIX: previously substituted the empty string, which deleted
            # the dots entirely instead of collapsing them into an ellipsis.
            token.content = ELLIPSIS_RE.sub("…", token.content)
            # but ?..... & !..... -> ?.. & !..
            token.content = ELLIPSIS_QUESTION_EXCLAMATION_RE.sub("\\1..", token.content)
            token.content = QUESTION_EXCLAMATION_RE.sub("\\1\\1\\1", token.content)
            # ,, ,,, ,,,, -> ,
            token.content = COMMA_RE.sub(",", token.content)
            # em-dash
            token.content = EM_DASH_RE.sub("\\1\u2014", token.content)
            # en-dash
            token.content = EN_DASH_RE.sub("\\1\u2013", token.content)
            token.content = EN_DASH_INDENT_RE.sub("\\1\u2013", token.content)
        if token.type == "link_open" and token.info == "auto":
            inside_autolink -= 1
        if token.type == "link_close" and token.info == "auto":
            inside_autolink += 1
def replace(state: StateCore) -> None:
    """Core rule: run typographic replacements over every inline token.

    No-op unless the ``typographer`` option is enabled; each family of
    substitutions is gated on a cheap pretest of the token's raw content.
    """
    if not state.md.options.typographer:
        return
    for tok in state.tokens:
        if tok.type != "inline" or tok.children is None:
            continue
        if SCOPED_ABBR_RE.search(tok.content):
            replace_scoped(tok.children)
        if RARE_RE.search(tok.content):
            replace_rare(tok.children)

View File

@@ -0,0 +1,202 @@
"""Convert straight quotation marks to typographic ones"""
from __future__ import annotations
import re
from typing import Any
from ..common.utils import charCodeAt, isMdAsciiPunct, isPunctChar, isWhiteSpace
from ..token import Token
from .state_core import StateCore
QUOTE_TEST_RE = re.compile(r"['\"]")
QUOTE_RE = re.compile(r"['\"]")
APOSTROPHE = "\u2019"  # RIGHT SINGLE QUOTATION MARK, used for mid-word apostrophes
def replaceAt(string: str, index: int, ch: str) -> str:
    """Return *string* with the single character at *index* replaced by *ch*
    (which may be longer than one character).

    When the index is negative, the behavior is different from the js version.
    But basically, the index will not be negative.
    """
    assert index >= 0
    head, tail = string[:index], string[index + 1 :]
    return head + ch + tail
def process_inlines(tokens: list[Token], state: StateCore) -> None:
    """Replace straight quotes in a flat inline token stream with the
    typographic quotes configured in ``state.md.options.quotes``.

    Maintains a stack of candidate opening quotes across the token list.
    Ported from the JS implementation, including its index conventions.
    """
    stack: list[dict[str, Any]] = []
    for i, token in enumerate(tokens):
        thisLevel = token.level
        # Drop stack entries opened at a deeper nesting level than this token.
        j = 0
        for j in range(len(stack))[::-1]:
            if stack[j]["level"] <= thisLevel:
                break
        else:
            # When the loop is terminated without a "break".
            # Subtract 1 to get the same index as the js version.
            j -= 1
        stack = stack[: j + 1]
        if token.type != "text":
            continue
        text = token.content
        pos = 0
        maximum = len(text)
        while pos < maximum:
            goto_outer = False
            lastIndex = pos
            t = QUOTE_RE.search(text[lastIndex:])
            if not t:
                break
            canOpen = canClose = True
            pos = t.start(0) + lastIndex + 1
            isSingle = t.group(0) == "'"
            # Find previous character,
            # default to space if it's the beginning of the line
            lastChar: None | int = 0x20
            if t.start(0) + lastIndex - 1 >= 0:
                lastChar = charCodeAt(text, t.start(0) + lastIndex - 1)
            else:
                # Quote starts this token: look back through earlier tokens
                # for the last visible character.
                for j in range(i)[::-1]:
                    if tokens[j].type == "softbreak" or tokens[j].type == "hardbreak":
                        break
                    # should skip all tokens except 'text', 'html_inline' or 'code_inline'
                    if not tokens[j].content:
                        continue
                    lastChar = charCodeAt(tokens[j].content, len(tokens[j].content) - 1)
                    break
            # Find next character,
            # default to space if it's the end of the line
            nextChar: None | int = 0x20
            if pos < maximum:
                nextChar = charCodeAt(text, pos)
            else:
                # Quote ends this token: look ahead through later tokens
                # for the next visible character.
                for j in range(i + 1, len(tokens)):
                    # nextChar defaults to 0x20
                    if tokens[j].type == "softbreak" or tokens[j].type == "hardbreak":
                        break
                    # should skip all tokens except 'text', 'html_inline' or 'code_inline'
                    if not tokens[j].content:
                        continue
                    nextChar = charCodeAt(tokens[j].content, 0)
                    break
            isLastPunctChar = lastChar is not None and (
                isMdAsciiPunct(lastChar) or isPunctChar(chr(lastChar))
            )
            isNextPunctChar = nextChar is not None and (
                isMdAsciiPunct(nextChar) or isPunctChar(chr(nextChar))
            )
            isLastWhiteSpace = lastChar is not None and isWhiteSpace(lastChar)
            isNextWhiteSpace = nextChar is not None and isWhiteSpace(nextChar)
            if isNextWhiteSpace:  # noqa: SIM114
                canOpen = False
            elif isNextPunctChar and not (isLastWhiteSpace or isLastPunctChar):
                canOpen = False
            if isLastWhiteSpace:  # noqa: SIM114
                canClose = False
            elif isLastPunctChar and not (isNextWhiteSpace or isNextPunctChar):
                canClose = False
            if nextChar == 0x22 and t.group(0) == '"':  # 0x22: "  # noqa: SIM102
                if (
                    lastChar is not None and lastChar >= 0x30 and lastChar <= 0x39
                ):  # 0x30: 0, 0x39: 9
                    # special case: 1"" - count first quote as an inch
                    canClose = canOpen = False
            if canOpen and canClose:
                # Replace quotes in the middle of punctuation sequence, but not
                # in the middle of the words, i.e.:
                #
                # 1. foo " bar " baz - not replaced
                # 2. foo-"-bar-"-baz - replaced
                # 3. foo"bar"baz - not replaced
                canOpen = isLastPunctChar
                canClose = isNextPunctChar
            if not canOpen and not canClose:
                # middle of word
                if isSingle:
                    token.content = replaceAt(
                        token.content, t.start(0) + lastIndex, APOSTROPHE
                    )
                continue
            if canClose:
                # this could be a closing quote, rewind the stack to get a match
                for j in range(len(stack))[::-1]:
                    item = stack[j]
                    if stack[j]["level"] < thisLevel:
                        break
                    if item["single"] == isSingle and stack[j]["level"] == thisLevel:
                        item = stack[j]
                        if isSingle:
                            openQuote = state.md.options.quotes[2]
                            closeQuote = state.md.options.quotes[3]
                        else:
                            openQuote = state.md.options.quotes[0]
                            closeQuote = state.md.options.quotes[1]
                        # replace token.content *before* tokens[item.token].content,
                        # because, if they are pointing at the same token, replaceAt
                        # could mess up indices when quote length != 1
                        token.content = replaceAt(
                            token.content, t.start(0) + lastIndex, closeQuote
                        )
                        tokens[item["token"]].content = replaceAt(
                            tokens[item["token"]].content, item["pos"], openQuote
                        )
                        # Multi-character quotes shift subsequent offsets.
                        pos += len(closeQuote) - 1
                        if item["token"] == i:
                            pos += len(openQuote) - 1
                        text = token.content
                        maximum = len(text)
                        stack = stack[:j]
                        goto_outer = True
                        break
                if goto_outer:
                    goto_outer = False
                    continue
            if canOpen:
                stack.append(
                    {
                        "token": i,
                        "pos": t.start(0) + lastIndex,
                        "single": isSingle,
                        "level": thisLevel,
                    }
                )
            elif canClose and isSingle:
                token.content = replaceAt(
                    token.content, t.start(0) + lastIndex, APOSTROPHE
                )
def smartquotes(state: StateCore) -> None:
    """Core rule: apply typographic quote replacement to inline tokens.

    No-op unless the ``typographer`` option is enabled; tokens whose raw
    content contains no straight quote are skipped via a cheap pretest.
    """
    if not state.md.options.typographer:
        return
    for tok in state.tokens:
        if tok.type != "inline" or not QUOTE_RE.search(tok.content):
            continue
        if tok.children is not None:
            process_inlines(tok.children, state)

View File

@@ -0,0 +1,25 @@
from __future__ import annotations
from typing import TYPE_CHECKING
from ..ruler import StateBase
from ..token import Token
from ..utils import EnvType
if TYPE_CHECKING:
from markdown_it import MarkdownIt
class StateCore(StateBase):
    """Mutable state threaded through the chain of core rules."""

    def __init__(
        self,
        src: str,
        md: MarkdownIt,
        env: EnvType,
        tokens: list[Token] | None = None,
    ) -> None:
        self.src = src  # raw source string being parsed
        self.md = md  # link to parser instance
        self.env = env  # sandbox for rule cross-talk / plugin data
        # An empty or missing token list is replaced by a fresh one.
        self.tokens: list[Token] = tokens if tokens else []
        self.inlineMode = False  # when True, `block` emits one inline token

View File

@@ -0,0 +1,35 @@
"""Join raw text tokens with the rest of the text
This is set as a separate rule to provide an opportunity for plugins
to run text replacements after text join, but before escape join.
For example, `\\:)` shouldn't be replaced with an emoji.
"""
from __future__ import annotations
from ..token import Token
from .state_core import StateCore
def text_join(state: StateCore) -> None:
    """Merge ``text_special`` tokens into the surrounding plain text.

    Every ``text_special`` child becomes an ordinary ``text`` token, and
    runs of adjacent ``text`` tokens are concatenated into a single token.
    """
    for inline_tok in state.tokens[:]:
        if inline_tok.type != "inline":
            continue
        merged: list[Token] = []
        for child in inline_tok.children or []:
            if child.type == "text_special":
                child.type = "text"
            if child.type == "text" and merged and merged[-1].type == "text":
                # Fold into the preceding text token.
                merged[-1].content += child.content
            else:
                merged.append(child)
        inline_tok.children = merged