Source code for icalendar.parser.content_line

"""parsing and generation of content lines"""

import re

from icalendar.parser.parameter import Parameters
from icalendar.parser.property import unescape_backslash, unescape_list_or_string
from icalendar.parser.string import (
    _escape_string,
    _unescape_string,
    foldline,
    validate_token,
)
from icalendar.parser_tools import DEFAULT_ENCODING, ICAL_TYPE, to_unicode

# Matches a fold: one or more line breaks (CRLF or bare LF) followed by a
# single space or tab. Replacing every match with "" unfolds content lines.
UFOLD = re.compile("(\r?\n)+[ \t]")
# Matches one line break (CRLF or bare LF); used to split unfolded text into
# individual content lines.
NEWLINE = re.compile(r"\r?\n")

OWS = " \t"
OWS_AROUND_DELIMITERS_RE = re.compile(r"[ \t]*([;=])[ \t]*")


def _strip_ows_around_delimiters(st: str, delimiters: str = ";=") -> str:
    """Strip optional whitespace around delimiters outside of quoted sections,
    respecting backslash escapes so that escaped delimiters are not treated as
    separators.

    This is a lenient parsing helper (used when strict=False) to support
    iCalendar content lines that contain extra whitespace around tokens.
    """
    if not st:
        return st

    # Fast path for the common case in non-strict mode:
    # no whitespace in the parameter section means there is nothing to normalize.
    if " " not in st and "\t" not in st:
        return st

    # Fast regex-based path for simple parameter sections without quoting/escaping.
    if delimiters == ";=" and '"' not in st and "\\" not in st:
        return OWS_AROUND_DELIMITERS_RE.sub(r"\1", st).strip()

    out: list[str] = []
    pending_ws: list[str] = []
    in_quotes = False
    escaped = False
    # True only if the last appended char was a raw delimiter.
    last_was_delimiter = False

    def flush_pending() -> None:
        nonlocal pending_ws
        if not pending_ws:
            return
        if not last_was_delimiter:
            out.extend(pending_ws)
        pending_ws.clear()

    for ch in st:
        # Handle escaped character (the backslash set escaped in previous iteration)
        if escaped:
            flush_pending()
            out.append(ch)
            escaped = False
            last_was_delimiter = False
            continue

        # Handle backslash to escape next character
        if ch == "\\" and not in_quotes:
            flush_pending()
            out.append(ch)
            escaped = True
            last_was_delimiter = False
            continue

        # Handle quote toggling
        if ch == '"' and not escaped:
            in_quotes = not in_quotes
            flush_pending()
            out.append(ch)
            last_was_delimiter = False
            continue

        # Whitespace outside quotes is buffered
        if not in_quotes and not escaped and ch in OWS:
            pending_ws.append(ch)
            continue

        # Raw delimiter (unescaped and outside quotes)
        if not in_quotes and not escaped and ch in delimiters:
            pending_ws.clear()
            while out and out[-1] in OWS:
                out.pop()
            out.append(ch)
            last_was_delimiter = True
            continue

        # Regular character
        flush_pending()
        out.append(ch)
        last_was_delimiter = False

    if pending_ws and not last_was_delimiter:
        out.extend(pending_ws)

    return "".join(out).strip()


class Contentline(str):
    """A content line is basically a string that can be folded and
    parsed into parts.

    The instance carries a ``strict`` flag that :meth:`parts` consults to
    decide whether lenient whitespace normalization is applied.
    """

    __slots__ = ("strict",)

    def __new__(cls, value, strict=False, encoding=DEFAULT_ENCODING):
        """Create a content line from ``value`` after converting it to text.

        ``value`` is passed through ``to_unicode`` with ``encoding``; the
        result must not contain a line feed.
        """
        value = to_unicode(value, encoding=encoding)
        # NOTE: this is an ``assert``, so the check is skipped under ``python -O``.
        assert "\n" not in value, (
            "Content line can not contain unescaped new line characters."
        )
        self = super().__new__(cls, value)
        self.strict = strict
        return self

    @classmethod
    def from_parts(
        cls,
        name: ICAL_TYPE,
        params: Parameters,
        values,
        sorted: bool = True,  # noqa: A002
    ):
        """Turn a parts into a content line.

        ``values`` is serialized with its own ``to_ical()`` when it has one,
        otherwise it is wrapped in ``vText`` first. The result has the form
        ``name;params:values``, or ``name:values`` when the parameters are
        empty or serialize to nothing.
        """
        assert isinstance(params, Parameters)
        if hasattr(values, "to_ical"):
            values = values.to_ical()
        else:
            # NOTE(review): imported locally, presumably to avoid a circular
            # import with icalendar.prop - confirm before moving it.
            from icalendar.prop import vText

            values = vText(values).to_ical()

        # TODO: after unicode only, remove this
        # Convert back to unicode, after to_ical encoded it.
        name = to_unicode(name)
        values = to_unicode(values)
        if params:
            params = to_unicode(params.to_ical(sorted=sorted))
        if params:
            # some parameter values can be skipped during serialization
            return cls(f"{name};{params}:{values}")
        return cls(f"{name}:{values}")

    def parts(self) -> tuple[str, Parameters, str]:
        """Split the content line into ``name``, ``parameters``, and ``values`` parts.

        Properly handles escaping with backslashes and double-quote sections
        to avoid corrupting URL-encoded characters in values.

        Example with parameter:

        .. code-block:: ics

            DESCRIPTION;ALTREP="cid:part1.0001@example.org":The Fall'98 Wild

        Example without parameters:

        .. code-block:: ics

            DESCRIPTION:The Fall'98 Wild

        Raises:
            ValueError: If the line cannot be split or a part fails
                validation. The message includes the whole content line.
        """
        try:
            name_split: int | None = None
            value_split: int | None = None
            in_quotes: bool = False
            escaped: bool = False

            # Single scan for the first unquoted, unescaped ":" or ";" (end of
            # the name) and the first such ":" (start of the value).
            for i, ch in enumerate(self):
                if ch == '"' and not escaped:
                    in_quotes = not in_quotes
                elif ch == "\\" and not in_quotes:
                    # Keep the flag set for exactly the next character.
                    escaped = True
                    continue
                elif not in_quotes and not escaped:
                    # Find first delimiter for name
                    if ch in ":;" and name_split is None:
                        name_split = i
                    # Find value delimiter (first colon)
                    if ch == ":" and value_split is None:
                        value_split = i
                escaped = False

            # Validate parsing results
            if not value_split:
                # No colon found - value is empty, use end of string
                value_split = len(self)

            # Extract name - if no delimiter,
            # take whole string for validate_token to reject
            name = self[:name_split] if name_split else self
            if not self.strict:
                # Lenient mode: drop every space and tab from the name.
                name = re.sub(r"[ \t]+", "", name.strip())
            validate_token(name)

            if not name_split or name_split + 1 == value_split:
                # No delimiter or empty parameter section
                raise ValueError("Invalid content line")  # noqa: TRY301

            # Parse parameters - they still need to be escaped/unescaped
            # for proper handling of commas, semicolons, etc. in parameter values
            raw_param_str = self[name_split + 1 : value_split]
            if not self.strict:
                raw_param_str = _strip_ows_around_delimiters(raw_param_str)
            param_str = _escape_string(raw_param_str)
            params = Parameters.from_ical(param_str, strict=self.strict)
            params = Parameters(
                (_unescape_string(key), unescape_list_or_string(value))
                for key, value in params.items()
            )
            # Unescape backslash sequences in values but preserve URL encoding
            values = unescape_backslash(self[value_split + 1 :])
        except ValueError as exc:
            raise ValueError(
                f"Content line could not be parsed into parts: '{self}': {exc}"
            ) from exc
        return (name, params, values)

    @classmethod
    def from_ical(cls, ical, strict=False):
        """Unfold the content lines in an iCalendar into long content lines."""
        ical = to_unicode(ical)
        # a fold is one or more line breaks (CRLF or LF) followed by either
        # a space or a tab
        return cls(UFOLD.sub("", ical), strict=strict)

    def to_ical(self):
        """Long content lines are folded so they are less than 75 characters
        wide.

        Returns the line folded by ``foldline`` and encoded with
        ``DEFAULT_ENCODING``.
        """
        return foldline(self).encode(DEFAULT_ENCODING)


class Contentlines(list[Contentline]):
    """I assume that iCalendar files generally are a few kilobytes in size.
    Then this should be efficient. for Huge files, an iterator should probably
    be used instead.
    """

    def to_ical(self):
        """Simply join self.

        Empty entries are skipped; the serialized lines are joined with CRLF
        and the result always ends with CRLF.
        """
        return b"\r\n".join(line.to_ical() for line in self if line) + b"\r\n"

    @classmethod
    def from_ical(cls, st):
        """Parses a string into content lines.

        The text is unfolded, split on line breaks, and every non-empty line
        becomes a ``Contentline``. An empty string is appended as the last
        element.

        Raises:
            ValueError: If unfolding, splitting, or building a line fails.
        """
        st = to_unicode(st)
        try:
            # a fold is one or more line breaks (CRLF or LF) followed by
            # either a space or a tab
            unfolded = UFOLD.sub("", st)
            lines = cls(Contentline(line) for line in NEWLINE.split(unfolded) if line)
            lines.append("")  # '\r\n' at the end of every content line
        except Exception as e:
            raise ValueError("Expected StringType with content lines") from e
        return lines


# Public names exported by ``import *`` from this module.
__all__ = ["Contentline", "Contentlines"]