Третий коммит, добавление share, share_kb, а также ADMIN_ID

This commit is contained in:
2025-07-22 13:50:14 +03:00
parent 849feb7beb
commit b98123f4dc
1479 changed files with 323549 additions and 11 deletions

View File

@@ -0,0 +1,14 @@
from ._query import Query, QueryVariable, SimpleQuery
from ._url import URL, cache_clear, cache_configure, cache_info
# Version of the vendored yarl distribution.
__version__ = "1.20.1"

# Public names re-exported from the package submodules.
__all__ = (
    "URL",
    "SimpleQuery",
    "QueryVariable",
    "Query",
    "cache_clear",
    "cache_configure",
    "cache_info",
)

View File

@@ -0,0 +1,203 @@
"""URL parsing utilities."""
import re
import unicodedata
from functools import lru_cache
from typing import Union
from urllib.parse import scheme_chars, uses_netloc
from ._quoters import QUOTER, UNQUOTER_PLUS
# Leading and trailing C0 control and space to be stripped per WHATWG spec.
# == "".join([chr(i) for i in range(0, 0x20 + 1)])
WHATWG_C0_CONTROL_OR_SPACE = (
    "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10"
    "\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f "
)
# Unsafe bytes to be removed per WHATWG spec
UNSAFE_URL_BYTES_TO_REMOVE = ["\t", "\r", "\n"]
# Schemes that conventionally use a "//authority" component
# (frozenset for O(1) membership tests in unsplit_result).
USES_AUTHORITY = frozenset(uses_netloc)
# (scheme, netloc, path, query, fragment) as returned by split_url().
SplitURLType = tuple[str, str, str, str, str]
def split_url(url: str) -> SplitURLType:
    """Split URL into parts."""
    # Adapted from urllib.parse.urlsplit
    # Only lstrip url as some applications rely on preserving trailing space.
    # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
    url = url.lstrip(WHATWG_C0_CONTROL_OR_SPACE)
    for b in UNSAFE_URL_BYTES_TO_REMOVE:
        if b in url:
            url = url.replace(b, "")
    scheme = netloc = query = fragment = ""
    i = url.find(":")
    if i > 0 and url[0] in scheme_chars:
        # Only treat the ":" as a scheme separator when every char before it
        # is a valid scheme char (the for-else fires when no char broke out).
        for c in url[1:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i + 1 :]
    has_hash = "#" in url
    has_question_mark = "?" in url
    if url[:2] == "//":
        delim = len(url)  # position of end of domain part of url, default is end
        if has_hash and has_question_mark:
            delim_chars = "/?#"
        elif has_question_mark:
            delim_chars = "/?"
        elif has_hash:
            delim_chars = "/#"
        else:
            delim_chars = "/"
        for c in delim_chars:  # look for delimiters; the order is NOT important
            wdelim = url.find(c, 2)  # find first of this delim
            if wdelim >= 0 and wdelim < delim:  # if found
                delim = wdelim  # use earliest delim position
        netloc = url[2:delim]
        url = url[delim:]
        # Brackets (RFC 3986 IP-literal) must come as a matched pair.
        has_left_bracket = "[" in netloc
        has_right_bracket = "]" in netloc
        if (has_left_bracket and not has_right_bracket) or (
            has_right_bracket and not has_left_bracket
        ):
            raise ValueError("Invalid IPv6 URL")
        if has_left_bracket:
            bracketed_host = netloc.partition("[")[2].partition("]")[0]
            # Valid bracketed hosts are defined in
            # https://www.rfc-editor.org/rfc/rfc3986#page-49
            # https://url.spec.whatwg.org/
            if bracketed_host and bracketed_host[0] == "v":
                if not re.match(r"\Av[a-fA-F0-9]+\..+\Z", bracketed_host):
                    raise ValueError("IPvFuture address is invalid")
            elif ":" not in bracketed_host:
                raise ValueError("The IPv6 content between brackets is not valid")
    # Strip the fragment before the query: a "?" inside the fragment must not
    # be mistaken for a query separator.
    if has_hash:
        url, _, fragment = url.partition("#")
    if has_question_mark:
        url, _, query = url.partition("?")
    # Non-ASCII netlocs need the NFKC safety check (see _check_netloc).
    if netloc and not netloc.isascii():
        _check_netloc(netloc)
    return scheme, netloc, url, query, fragment
def _check_netloc(netloc: str) -> None:
# Adapted from urllib.parse._checknetloc
# looking for characters like \u2100 that expand to 'a/c'
# IDNA uses NFKC equivalence, so normalize for this check
# ignore characters already included
# but not the surrounding text
n = netloc.replace("@", "").replace(":", "").replace("#", "").replace("?", "")
normalized_netloc = unicodedata.normalize("NFKC", n)
if n == normalized_netloc:
return
# Note that there are no unicode decompositions for the character '@' so
# its currently impossible to have test coverage for this branch, however if the
# one should be added in the future we want to make sure its still checked.
for c in "/?#@:": # pragma: no branch
if c in normalized_netloc:
raise ValueError(
f"netloc '{netloc}' contains invalid "
"characters under NFKC normalization"
)
@lru_cache  # match the same size as urlsplit
def split_netloc(
    netloc: str,
) -> tuple[Union[str, None], Union[str, None], Union[str, None], Union[int, None]]:
    """Split netloc into username, password, host and port."""
    username: Union[str, None] = None
    password: Union[str, None] = None
    hostinfo = netloc
    if "@" in netloc:
        # rpartition: everything after the *last* "@" is the host info.
        userinfo, _, hostinfo = netloc.rpartition("@")
        username, sep, password = userinfo.partition(":")
        if not sep:
            # No ":" in the userinfo means no password was supplied at all.
            password = None
    if "[" in hostinfo:
        # Bracketed (IPv6/IPvFuture) host: the port follows the "]".
        bracketed = hostinfo.partition("[")[2]
        hostname, _, after_bracket = bracketed.partition("]")
        port_str = after_bracket.partition(":")[2]
    else:
        hostname, _, port_str = hostinfo.partition(":")
    port: Union[int, None] = None
    if port_str:
        try:
            port = int(port_str)
        except ValueError:
            raise ValueError("Invalid URL: port can't be converted to integer")
        if not (0 <= port <= 65535):
            raise ValueError("Port out of range 0-65535")
    # Empty strings collapse to None so callers get a uniform sentinel.
    return username or None, password, hostname or None, port
def unsplit_result(
    scheme: str, netloc: str, url: str, query: str, fragment: str
) -> str:
    """Unsplit a URL without any normalization."""
    # An authority section is rendered when a netloc is present, the scheme
    # conventionally carries one, or the path itself already starts with "//".
    has_authority = (
        bool(netloc) or (scheme and scheme in USES_AUTHORITY) or url[:2] == "//"
    )
    if has_authority:
        if url and not url.startswith("/"):
            # A relative path must be separated from the authority by "/".
            url = f"{scheme}://{netloc}/{url}" if scheme else f"{scheme}:{url}"
        else:
            url = f"{scheme}://{netloc}{url}" if scheme else f"//{netloc}{url}"
    elif scheme:
        url = f"{scheme}:{url}"
    if query:
        url = f"{url}?{query}"
    if fragment:
        return f"{url}#{fragment}"
    return url
@lru_cache  # match the same size as urlsplit
def make_netloc(
    user: Union[str, None],
    password: Union[str, None],
    host: Union[str, None],
    port: Union[int, None],
    encode: bool = False,
) -> str:
    """Make netloc from parts.

    The user and password are encoded if encode is True.

    The host must already be encoded with _encode_host.
    """
    if host is None:
        return ""
    hostport = f"{host}:{port}" if port is not None else host
    if user is None and password is None:
        return hostport
    if password is not None:
        # A password always forces the "user:password@" form, even when the
        # user part is empty (":pw@host").
        userinfo = user or ""
        if userinfo and encode:
            userinfo = QUOTER(userinfo)
        pw = QUOTER(password) if encode else password
        return f"{userinfo}:{pw}@{hostport}"
    if user and encode:
        user = QUOTER(user)
    return f"{user}@{hostport}" if user else hostport
def query_to_pairs(query_string: str) -> list[tuple[str, str]]:
    """Parse a query given as a string argument.

    Works like urllib.parse.parse_qsl with keep empty values.
    """
    if not query_string:
        return []
    # "key" without "=" yields ("key", ""), matching keep_blank_values.
    return [
        (UNQUOTER_PLUS(key), UNQUOTER_PLUS(value))
        for key, _, value in (
            chunk.partition("=") for chunk in query_string.split("&")
        )
    ]

View File

@@ -0,0 +1,41 @@
"""Utilities for working with paths."""
from collections.abc import Sequence
from contextlib import suppress
def normalize_path_segments(segments: Sequence[str]) -> list[str]:
    """Drop '.' and '..' from a sequence of str segments"""
    output: list[str] = []
    for segment in segments:
        if segment == "..":
            # '..' pops the previous segment; at the root there is nothing
            # to pop, which matches rfc3986 resolution (extra '..' ignored).
            if output:
                output.pop()
        elif segment != ".":
            output.append(segment)
    if segments and segments[-1] in (".", ".."):
        # A trailing relative segment denotes a directory, so the result
        # keeps a trailing '/' (represented by an empty final segment).
        output.append("")
    return output
def normalize_path(path: str) -> str:
    """Drop '.' and '..' segments from a str path."""
    if path.startswith("/"):
        # preserve the "/" root element of absolute paths, copying it to the
        # normalised output as per sections 5.2.4 and 6.2.2.3 of rfc3986.
        return "/" + "/".join(normalize_path_segments(path[1:].split("/")))
    return "/".join(normalize_path_segments(path.split("/")))

View File

@@ -0,0 +1,114 @@
"""Query string handling."""
import math
from collections.abc import Iterable, Mapping, Sequence
from typing import Any, SupportsInt, Union
from multidict import istr
from ._quoters import QUERY_PART_QUOTER, QUERY_QUOTER
SimpleQuery = Union[str, SupportsInt, float]
QueryVariable = Union[SimpleQuery, Sequence[SimpleQuery]]
Query = Union[
    None, str, Mapping[str, QueryVariable], Sequence[tuple[str, QueryVariable]]
]


def query_var(v: SimpleQuery) -> str:
    """Convert a query variable to a string."""
    v_type = type(v)
    if v_type is int:  # Fast path for non-subclassed int
        return str(v)
    if isinstance(v, str):
        return v
    if isinstance(v, float):
        # Non-finite floats have no sensible query representation.
        if math.isinf(v):
            raise ValueError("float('inf') is not supported")
        if math.isnan(v):
            raise ValueError("float('nan') is not supported")
        return str(float(v))
    # bool is excluded on purpose: True/False in a query is ambiguous.
    if v_type is not bool and isinstance(v, SupportsInt):
        return str(int(v))
    raise TypeError(
        f"Invalid variable type: value "
        f"should be str, int or float, got {v!r} "
        f"of type {v_type}"
    )
def get_str_query_from_sequence_iterable(
    items: Iterable[tuple[Union[str, istr], QueryVariable]],
) -> str:
    """Return a query string from a sequence of (key, value) pairs.

    value is a single value or a sequence of values for the key

    The sequence of values must be a list or tuple.
    """
    quoter = QUERY_PART_QUOTER
    parts: list[str] = []
    for key, value in items:
        # A list/tuple value fans out into one pair per element; a plain
        # value (including str, which is deliberately not treated as a
        # sequence here) produces a single pair.
        if type(value) is not str and isinstance(value, (list, tuple)):
            values = value
        else:
            values = (value,)
        for one in values:
            text = one if type(one) is str else query_var(one)
            parts.append(f"{quoter(key)}={quoter(text)}")
    return "&".join(parts)
def get_str_query_from_iterable(
    items: Iterable[tuple[Union[str, istr], SimpleQuery]],
) -> str:
    """Return a query string from an iterable.

    The iterable must contain (key, value) pairs.

    The values are not allowed to be sequences, only single values are
    allowed. For sequences, use `_get_str_query_from_sequence_iterable`.
    """
    quoter = QUERY_PART_QUOTER
    # A list is built explicitly (rather than a genexpr) since listcomps are
    # inlined on CPython 3.12+ and join over a list is slightly faster.
    parts: list[str] = []
    for key, value in items:
        text = value if type(value) is str else query_var(value)
        parts.append(f"{quoter(key)}={quoter(text)}")
    return "&".join(parts)
def get_str_query(*args: Any, **kwargs: Any) -> Union[str, None]:
    """Return a query string from supported args.

    Accepts either a single positional query (str / mapping / sequence of
    pairs / None) or keyword arguments — never both.
    """
    if kwargs:
        if args:
            raise ValueError("Either kwargs or single query parameter must be present")
        query: Union[str, Mapping[str, QueryVariable], None] = kwargs
    elif len(args) == 1:
        query = args[0]
    else:
        raise ValueError("Either kwargs or single query parameter must be present")
    if query is None:
        return None
    if not query:
        return ""
    # Dispatch order matters: exact dict first (fast path), then str,
    # then other Mappings, then reject binary types, then pair sequences.
    if type(query) is dict:
        return get_str_query_from_sequence_iterable(query.items())
    if type(query) is str or isinstance(query, str):
        return QUERY_QUOTER(query)
    if isinstance(query, Mapping):
        return get_str_query_from_sequence_iterable(query.items())
    if isinstance(query, (bytes, bytearray, memoryview)):  # type: ignore[unreachable]
        raise TypeError(
            "Invalid query type: bytes, bytearray and memoryview are forbidden"
        )
    if isinstance(query, Sequence):
        # We don't expect sequence values if we're given a list of pairs
        # already; only mappings like builtin `dict` which can't have the
        # same key pointing to multiple values are allowed to use
        # `_query_seq_pairs`.
        return get_str_query_from_iterable(query)
    raise TypeError(
        "Invalid query type: only str, mapping or "
        "sequence of (key, value) pairs is allowed"
    )

View File

@@ -0,0 +1,33 @@
"""Quoting and unquoting utilities for URL parts."""
from typing import Union
from urllib.parse import quote
from ._quoting import _Quoter, _Unquoter
# Generic quoters (requote=False escapes "%" literally; the REQUOTER variants
# re-parse pre-existing %XX escapes instead).
QUOTER = _Quoter(requote=False)
REQUOTER = _Quoter()

# Path quoters: "@" and ":" stay literal; "/" and "+" are protected, i.e.
# their existing percent-escapes are preserved.
PATH_QUOTER = _Quoter(safe="@:", protected="/+", requote=False)
PATH_REQUOTER = _Quoter(safe="@:", protected="/+")

# Query quoters: qs=True switches to query-string rules (space becomes "+").
QUERY_QUOTER = _Quoter(safe="?/:@", protected="=+&;", qs=True, requote=False)
QUERY_REQUOTER = _Quoter(safe="?/:@", protected="=+&;", qs=True)
QUERY_PART_QUOTER = _Quoter(safe="?/:@", qs=True, requote=False)

# Fragment quoters.
FRAGMENT_QUOTER = _Quoter(safe="?/:@", requote=False)
FRAGMENT_REQUOTER = _Quoter(safe="?/:@")

# Unquoters for the corresponding URL components.
UNQUOTER = _Unquoter()
PATH_UNQUOTER = _Unquoter(unsafe="+")
PATH_SAFE_UNQUOTER = _Unquoter(ignore="/%", unsafe="+")
QS_UNQUOTER = _Unquoter(qs=True)
UNQUOTER_PLUS = _Unquoter(plus=True)  # to match urllib.parse.unquote_plus
def human_quote(s: Union[str, None], unsafe: str) -> Union[str, None]:
    """Return *s* quoted only as much as needed for human display.

    "%" plus every char in *unsafe* is percent-encoded; if the result is
    still not printable, each remaining unprintable char is quoted too.
    None and "" are passed through unchanged.
    """
    if not s:
        return s
    # "%" must be escaped first so the escapes we add are unambiguous.
    for needs_escape in "%" + unsafe:
        if needs_escape in s:
            s = s.replace(needs_escape, f"%{ord(needs_escape):02X}")
    if s.isprintable():
        return s
    return "".join(ch if ch.isprintable() else quote(ch) for ch in s)

View File

@@ -0,0 +1,19 @@
import os
import sys
from typing import TYPE_CHECKING
__all__ = ("_Quoter", "_Unquoter")


# Allow opting out of the C extension via the YARL_NO_EXTENSIONS env var.
NO_EXTENSIONS = bool(os.environ.get("YARL_NO_EXTENSIONS"))  # type: bool

# The compiled extension targets CPython; other implementations always use
# the pure-Python fallback.
if sys.implementation.name != "cpython":
    NO_EXTENSIONS = True


if TYPE_CHECKING or NO_EXTENSIONS:
    from ._quoting_py import _Quoter, _Unquoter
else:
    try:
        from ._quoting_c import _Quoter, _Unquoter
    except ImportError:  # pragma: no cover
        # Extension failed to build/import: fall back to pure Python.
        from ._quoting_py import _Quoter, _Unquoter  # type: ignore[assignment]

View File

@@ -0,0 +1,453 @@
# cython: language_level=3, freethreading_compatible=True
from cpython.exc cimport PyErr_NoMemory
from cpython.mem cimport PyMem_Free, PyMem_Malloc, PyMem_Realloc
from cpython.unicode cimport (
PyUnicode_DATA,
PyUnicode_DecodeASCII,
PyUnicode_DecodeUTF8Stateful,
PyUnicode_GET_LENGTH,
PyUnicode_KIND,
PyUnicode_READ,
)
from libc.stdint cimport uint8_t, uint64_t
from libc.string cimport memcpy, memset
from string import ascii_letters, digits
# Character classes from RFC 3986 section 2.2.
cdef str GEN_DELIMS = ":/?#[]@"
cdef str SUB_DELIMS_WITHOUT_QS = "!$'()*,"
# NOTE(review): differs from _quoting_py's SUB_DELIMS ("+&=;") — confirm
# intent; RESERVED is not referenced elsewhere in this module.
cdef str SUB_DELIMS = SUB_DELIMS_WITHOUT_QS + '+?=;'
cdef str RESERVED = GEN_DELIMS + SUB_DELIMS
cdef str UNRESERVED = ascii_letters + digits + '-._~'
cdef str ALLOWED = UNRESERVED + SUB_DELIMS_WITHOUT_QS
# Chars with special meaning inside a query string.
cdef str QS = '+&=;'

DEF BUF_SIZE = 8 * 1024  # 8KiB
cdef inline Py_UCS4 _to_hex(uint8_t v) noexcept:
    # Map a nibble (0-15) to its uppercase hex digit.
    if v < 10:
        return <Py_UCS4>(v+0x30)  # ord('0') == 0x30
    else:
        return <Py_UCS4>(v+0x41-10)  # ord('A') == 0x41


cdef inline int _from_hex(Py_UCS4 v) noexcept:
    # Map a hex digit (either case) back to its value, or -1 if not hex.
    if '0' <= v <= '9':
        return <int>(v) - 0x30  # ord('0') == 0x30
    elif 'A' <= v <= 'F':
        return <int>(v) - 0x41 + 10  # ord('A') == 0x41
    elif 'a' <= v <= 'f':
        return <int>(v) - 0x61 + 10  # ord('a') == 0x61
    else:
        return -1


cdef inline int _is_lower_hex(Py_UCS4 v) noexcept:
    return 'a' <= v <= 'f'


cdef inline long _restore_ch(Py_UCS4 d1, Py_UCS4 d2):
    # Decode the two hex digits of a "%XX" escape into the byte value,
    # or -1 when either digit is invalid.
    cdef int digit1 = _from_hex(d1)
    if digit1 < 0:
        return -1
    cdef int digit2 = _from_hex(d2)
    if digit2 < 0:
        return -1
    return digit1 << 4 | digit2
# 128-bit bitmaps (16 bytes) of ASCII chars that never need quoting.
cdef uint8_t ALLOWED_TABLE[16]
cdef uint8_t ALLOWED_NOTQS_TABLE[16]


cdef inline bint bit_at(uint8_t array[], uint64_t ch) noexcept:
    return array[ch >> 3] & (1 << (ch & 7))


cdef inline void set_bit(uint8_t array[], uint64_t ch) noexcept:
    array[ch >> 3] |= (1 << (ch & 7))


memset(ALLOWED_TABLE, 0, sizeof(ALLOWED_TABLE))
memset(ALLOWED_NOTQS_TABLE, 0, sizeof(ALLOWED_NOTQS_TABLE))

# Populate the bitmaps at import time: NOTQS additionally treats the
# query-string special chars (QS) as safe.
for i in range(128):
    if chr(i) in ALLOWED:
        set_bit(ALLOWED_TABLE, i)
        set_bit(ALLOWED_NOTQS_TABLE, i)
    if chr(i) in QS:
        set_bit(ALLOWED_NOTQS_TABLE, i)
# ----------------- writer ---------------------------

# Growable output buffer: starts on a caller-provided stack buffer and is
# promoted to a heap allocation only when it overflows.
cdef struct Writer:
    char *buf
    bint heap_allocated_buf
    Py_ssize_t size
    Py_ssize_t pos
    bint changed  # set when output differs from the input string


cdef inline void _init_writer(Writer* writer, char* buf):
    writer.buf = buf
    writer.heap_allocated_buf = False
    writer.size = BUF_SIZE
    writer.pos = 0
    writer.changed = 0


cdef inline void _release_writer(Writer* writer):
    # Only heap memory is ours to free; the stack buffer belongs to the caller.
    if writer.heap_allocated_buf:
        PyMem_Free(writer.buf)


cdef inline int _write_char(Writer* writer, Py_UCS4 ch, bint changed):
    """Append a single char, growing the buffer by BUF_SIZE when full.

    Returns 0 on success, -1 with MemoryError set on allocation failure.
    """
    cdef char * buf
    cdef Py_ssize_t size
    if writer.pos == writer.size:
        # reallocate
        size = writer.size + BUF_SIZE
        if not writer.heap_allocated_buf:
            # First overflow: move from the stack buffer to the heap.
            buf = <char*>PyMem_Malloc(size)
            if buf == NULL:
                PyErr_NoMemory()
                return -1
            memcpy(buf, writer.buf, writer.size)
            writer.heap_allocated_buf = True
        else:
            buf = <char*>PyMem_Realloc(writer.buf, size)
            if buf == NULL:
                PyErr_NoMemory()
                return -1
        writer.buf = buf
        writer.size = size
    writer.buf[writer.pos] = <char>ch
    writer.pos += 1
    writer.changed |= changed
    return 0


cdef inline int _write_pct(Writer* writer, uint8_t ch, bint changed):
    # Emit a "%XX" escape (uppercase hex) for one byte.
    if _write_char(writer, '%', changed) < 0:
        return -1
    if _write_char(writer, _to_hex(<uint8_t>ch >> 4), changed) < 0:
        return -1
    return _write_char(writer, _to_hex(<uint8_t>ch & 0x0f), changed)


cdef inline int _write_utf8(Writer* writer, Py_UCS4 symbol):
    # Percent-encode the UTF-8 byte sequence of a code point (1-4 bytes).
    cdef uint64_t utf = <uint64_t> symbol

    if utf < 0x80:
        return _write_pct(writer, <uint8_t>utf, True)
    elif utf < 0x800:
        if _write_pct(writer, <uint8_t>(0xc0 | (utf >> 6)), True) < 0:
            return -1
        return _write_pct(writer, <uint8_t>(0x80 | (utf & 0x3f)), True)
    elif 0xD800 <= utf <= 0xDFFF:
        # surogate pair, ignored
        return 0
    elif utf < 0x10000:
        if _write_pct(writer, <uint8_t>(0xe0 | (utf >> 12)), True) < 0:
            return -1
        if _write_pct(writer, <uint8_t>(0x80 | ((utf >> 6) & 0x3f)),
                      True) < 0:
            return -1
        return _write_pct(writer, <uint8_t>(0x80 | (utf & 0x3f)), True)
    elif utf > 0x10FFFF:
        # symbol is too large
        return 0
    else:
        if _write_pct(writer, <uint8_t>(0xf0 | (utf >> 18)), True) < 0:
            return -1
        if _write_pct(writer, <uint8_t>(0x80 | ((utf >> 12) & 0x3f)),
                      True) < 0:
            return -1
        if _write_pct(writer, <uint8_t>(0x80 | ((utf >> 6) & 0x3f)),
                      True) < 0:
            return -1
        return _write_pct(writer, <uint8_t>(0x80 | (utf & 0x3f)), True)

# --------------------- end writer --------------------------
cdef class _Quoter:
    # Percent-encoder configured by safe/protected char sets.
    cdef bint _qs
    cdef bint _requote
    cdef uint8_t _safe_table[16]       # ASCII chars emitted unescaped
    cdef uint8_t _protected_table[16]  # ASCII chars always kept %XX-encoded

    def __init__(
        self, *, str safe='', str protected='', bint qs=False, bint requote=True,
    ):
        cdef Py_UCS4 ch

        self._qs = qs
        self._requote = requote

        # Base safe set: outside query-string mode the QS special chars
        # ("+&=;") are also safe.
        if not self._qs:
            memcpy(self._safe_table,
                   ALLOWED_NOTQS_TABLE,
                   sizeof(self._safe_table))
        else:
            memcpy(self._safe_table,
                   ALLOWED_TABLE,
                   sizeof(self._safe_table))
        for ch in safe:
            if ord(ch) > 127:
                raise ValueError("Only safe symbols with ORD < 128 are allowed")
            set_bit(self._safe_table, ch)

        memset(self._protected_table, 0, sizeof(self._protected_table))
        # Protected chars are both safe and preserved in their escaped form.
        for ch in protected:
            if ord(ch) > 127:
                raise ValueError("Only safe symbols with ORD < 128 are allowed")
            set_bit(self._safe_table, ch)
            set_bit(self._protected_table, ch)

    def __call__(self, val):
        if val is None:
            return None
        if type(val) is not str:
            if isinstance(val, str):
                # derived from str
                val = str(val)
            else:
                raise TypeError("Argument should be str")
        return self._do_quote_or_skip(<str>val)

    cdef str _do_quote_or_skip(self, str val):
        cdef char[BUF_SIZE] buffer
        cdef Py_UCS4 ch
        cdef Py_ssize_t length = PyUnicode_GET_LENGTH(val)
        cdef Py_ssize_t idx = length
        cdef bint must_quote = 0
        cdef Writer writer
        cdef int kind = PyUnicode_KIND(val)
        cdef const void *data = PyUnicode_DATA(val)

        # If everything in the string is in the safe
        # table and all ASCII, we can skip quoting
        while idx:
            idx -= 1
            ch = PyUnicode_READ(kind, data, idx)
            if ch >= 128 or not bit_at(self._safe_table, ch):
                must_quote = 1
                break

        if not must_quote:
            return val

        _init_writer(&writer, &buffer[0])
        try:
            return self._do_quote(<str>val, length, kind, data, &writer)
        finally:
            _release_writer(&writer)

    cdef str _do_quote(
        self,
        str val,
        Py_ssize_t length,
        int kind,
        const void *data,
        Writer *writer
    ):
        cdef Py_UCS4 ch
        cdef long chl
        cdef int changed
        cdef Py_ssize_t idx = 0

        while idx < length:
            ch = PyUnicode_READ(kind, data, idx)
            idx += 1
            # In requote mode, try to interpret "%XX" as an existing escape.
            if ch == '%' and self._requote and idx <= length - 2:
                chl = _restore_ch(
                    PyUnicode_READ(kind, data, idx),
                    PyUnicode_READ(kind, data, idx + 1)
                )
                if chl != -1:
                    ch = <Py_UCS4>chl
                    idx += 2
                    if ch < 128:
                        if bit_at(self._protected_table, ch):
                            # Keep protected chars percent-encoded.
                            if _write_pct(writer, ch, True) < 0:
                                raise
                            continue

                        if bit_at(self._safe_table, ch):
                            # Decode the escape for safe chars.
                            if _write_char(writer, ch, True) < 0:
                                raise
                            continue

                        # Re-emit the escape; "changed" only when the
                        # original hex digits were lowercase (normalized).
                        changed = (_is_lower_hex(PyUnicode_READ(kind, data, idx - 2)) or
                                   _is_lower_hex(PyUnicode_READ(kind, data, idx - 1)))
                        if _write_pct(writer, ch, changed) < 0:
                            raise
                        continue
                else:
                    # Invalid escape: treat the "%" as a literal char.
                    ch = '%'

            if self._write(writer, ch) < 0:
                raise

        if not writer.changed:
            return val
        else:
            return PyUnicode_DecodeASCII(writer.buf, writer.pos, "strict")

    cdef inline int _write(self, Writer *writer, Py_UCS4 ch):
        # Query-string mode encodes spaces as "+".
        if self._qs:
            if ch == ' ':
                return _write_char(writer, '+', True)

        if ch < 128 and bit_at(self._safe_table, ch):
            return _write_char(writer, ch, False)

        return _write_utf8(writer, ch)
cdef class _Unquoter:
    # Percent-decoder; chars in _ignore/_unsafe are re-quoted instead of
    # being decoded to their literal form.
    cdef str _ignore
    cdef bint _has_ignore
    cdef str _unsafe
    cdef bytes _unsafe_bytes
    cdef Py_ssize_t _unsafe_bytes_len
    cdef const unsigned char * _unsafe_bytes_char
    cdef bint _qs
    cdef bint _plus  # to match urllib.parse.unquote_plus
    cdef _Quoter _quoter
    cdef _Quoter _qs_quoter

    def __init__(self, *, ignore="", unsafe="", qs=False, plus=False):
        self._ignore = ignore
        self._has_ignore = bool(self._ignore)
        self._unsafe = unsafe
        # unsafe may only be extended ascii characters (0-255)
        # NOTE(review): .encode('ascii') actually restricts this to 0-127 —
        # confirm the intended range.
        self._unsafe_bytes = self._unsafe.encode('ascii')
        self._unsafe_bytes_len = len(self._unsafe_bytes)
        self._unsafe_bytes_char = self._unsafe_bytes
        self._qs = qs
        self._plus = plus
        self._quoter = _Quoter()
        self._qs_quoter = _Quoter(qs=True)

    def __call__(self, val):
        if val is None:
            return None
        if type(val) is not str:
            if isinstance(val, str):
                # derived from str
                val = str(val)
            else:
                raise TypeError("Argument should be str")
        return self._do_unquote(<str>val)

    cdef str _do_unquote(self, str val):
        cdef Py_ssize_t length = PyUnicode_GET_LENGTH(val)
        if length == 0:
            return val
        cdef list ret = []
        # Up to 4 pending UTF-8 bytes collected from consecutive %XX escapes.
        cdef char buffer[4]
        cdef Py_ssize_t buflen = 0
        cdef Py_ssize_t consumed
        cdef str unquoted
        cdef Py_UCS4 ch = 0
        cdef long chl = 0
        cdef Py_ssize_t idx = 0
        cdef Py_ssize_t start_pct
        cdef int kind = PyUnicode_KIND(val)
        cdef const void *data = PyUnicode_DATA(val)
        cdef bint changed = 0

        while idx < length:
            ch = PyUnicode_READ(kind, data, idx)
            idx += 1
            if ch == '%' and idx <= length - 2:
                changed = 1
                chl = _restore_ch(
                    PyUnicode_READ(kind, data, idx),
                    PyUnicode_READ(kind, data, idx + 1)
                )
                if chl != -1:
                    ch = <Py_UCS4>chl
                    idx += 2
                    assert buflen < 4
                    buffer[buflen] = ch
                    buflen += 1
                    # Incrementally decode the collected bytes as UTF-8.
                    try:
                        unquoted = PyUnicode_DecodeUTF8Stateful(buffer, buflen,
                                                                NULL, &consumed)
                    except UnicodeDecodeError:
                        # Flush the bad prefix verbatim and retry with just
                        # the current byte.
                        start_pct = idx - buflen * 3
                        buffer[0] = ch
                        buflen = 1
                        ret.append(val[start_pct : idx - 3])
                        try:
                            unquoted = PyUnicode_DecodeUTF8Stateful(buffer, buflen,
                                                                    NULL, &consumed)
                        except UnicodeDecodeError:
                            buflen = 0
                            ret.append(val[idx - 3 : idx])
                            continue
                    if not unquoted:
                        # Incomplete multi-byte sequence: keep buffering.
                        assert consumed == 0
                        continue
                    assert consumed == buflen
                    buflen = 0
                    if self._qs and unquoted in '+=&;':
                        # QS separators must stay escaped inside a query.
                        ret.append(self._qs_quoter(unquoted))
                    elif (
                        (self._unsafe_bytes_len and unquoted in self._unsafe) or
                        (self._has_ignore and unquoted in self._ignore)
                    ):
                        ret.append(self._quoter(unquoted))
                    else:
                        ret.append(unquoted)
                    continue
                else:
                    # Invalid escape: treat "%" as a literal char.
                    ch = '%'

            if buflen:
                # A non-escape char ends a pending escape run: emit the raw
                # "%XX..." text that produced the undecodable bytes.
                start_pct = idx - 1 - buflen * 3
                ret.append(val[start_pct : idx - 1])
                buflen = 0

            if ch == '+':
                if (
                    (not self._qs and not self._plus) or
                    (self._unsafe_bytes_len and self._is_char_unsafe(ch))
                ):
                    ret.append('+')
                else:
                    changed = 1
                    ret.append(' ')
                continue

            if self._unsafe_bytes_len and self._is_char_unsafe(ch):
                # Re-escape literal unsafe chars as uppercase %XX.
                changed = 1
                ret.append('%')
                h = hex(ord(ch)).upper()[2:]
                for ch in h:
                    ret.append(ch)
                continue

            ret.append(ch)

        if not changed:
            return val

        if buflen:
            # Trailing undecodable escapes are emitted verbatim.
            ret.append(val[length - buflen * 3 : length])

        return ''.join(ret)

    cdef inline bint _is_char_unsafe(self, Py_UCS4 ch):
        for i in range(self._unsafe_bytes_len):
            if ch == self._unsafe_bytes_char[i]:
                return True
        return False

View File

@@ -0,0 +1,213 @@
import codecs
import re
from string import ascii_letters, ascii_lowercase, digits
from typing import Union, cast, overload
BASCII_LOWERCASE = ascii_lowercase.encode("ascii")
# Every valid uppercase percent-escape: b"%00" .. b"%FF".
BPCT_ALLOWED = {f"%{i:02X}".encode("ascii") for i in range(256)}
# Character classes from RFC 3986 section 2.2.
GEN_DELIMS = ":/?#[]@"
SUB_DELIMS_WITHOUT_QS = "!$'()*,"
SUB_DELIMS = SUB_DELIMS_WITHOUT_QS + "+&=;"
RESERVED = GEN_DELIMS + SUB_DELIMS
UNRESERVED = ascii_letters + digits + "-._~"
ALLOWED = UNRESERVED + SUB_DELIMS_WITHOUT_QS

# Deliberately broad byte pattern ([A-Z0-9], input is uppercased first);
# non-hex matches are rejected later via int(..., 16) raising ValueError.
_IS_HEX = re.compile(b"[A-Z0-9][A-Z0-9]")
_IS_HEX_STR = re.compile("[A-Fa-f0-9][A-Fa-f0-9]")

# Incremental decoder factory: lets %XX escapes of one multi-byte UTF-8
# character be decoded across loop iterations.
utf8_decoder = codecs.getincrementaldecoder("utf-8")
class _Quoter:
    """Percent-encode a str, optionally re-normalizing existing %XX escapes."""

    def __init__(
        self,
        *,
        safe: str = "",
        protected: str = "",
        qs: bool = False,
        requote: bool = True,
    ) -> None:
        # safe: extra chars emitted as-is; protected: chars whose existing
        # escapes are preserved; qs: query-string mode (space -> "+");
        # requote: re-parse pre-existing "%XX" instead of escaping the "%".
        self._safe = safe
        self._protected = protected
        self._qs = qs
        self._requote = requote

    @overload
    def __call__(self, val: str) -> str: ...
    @overload
    def __call__(self, val: None) -> None: ...
    def __call__(self, val: Union[str, None]) -> Union[str, None]:
        if val is None:
            return None
        if not isinstance(val, str):
            raise TypeError("Argument should be str")
        if not val:
            return ""
        bval = val.encode("utf8", errors="ignore")
        ret = bytearray()
        pct = bytearray()  # pending "%XX" escape being scanned (requote mode)
        safe = self._safe
        safe += ALLOWED
        if not self._qs:
            safe += "+&=;"
        safe += self._protected
        bsafe = safe.encode("ascii")
        idx = 0
        while idx < len(bval):
            ch = bval[idx]
            idx += 1

            if pct:
                if ch in BASCII_LOWERCASE:
                    ch = ch - 32  # convert to uppercase
                pct.append(ch)
                if len(pct) == 3:  # pragma: no branch   # peephole optimizer
                    buf = pct[1:]
                    if not _IS_HEX.match(buf):
                        # Not a valid escape: emit "%25" and re-scan the two
                        # chars that followed the "%".
                        ret.extend(b"%25")
                        pct.clear()
                        idx -= 2
                        continue
                    try:
                        unquoted = chr(int(pct[1:].decode("ascii"), base=16))
                    except ValueError:
                        ret.extend(b"%25")
                        pct.clear()
                        idx -= 2
                        continue

                    if unquoted in self._protected:
                        # Keep protected chars in their escaped form.
                        ret.extend(pct)
                    elif unquoted in safe:
                        # Decode escapes of safe chars to the literal char.
                        ret.append(ord(unquoted))
                    else:
                        ret.extend(pct)
                    pct.clear()

                # special case, if we have only one char after "%"
                elif len(pct) == 2 and idx == len(bval):
                    ret.extend(b"%25")
                    pct.clear()
                    idx -= 1

                continue

            elif ch == ord("%") and self._requote:
                pct.clear()
                pct.append(ch)

                # special case if "%" is last char
                if idx == len(bval):
                    ret.extend(b"%25")

                continue

            if self._qs and ch == ord(" "):
                ret.append(ord("+"))
                continue
            if ch in bsafe:
                ret.append(ch)
                continue

            ret.extend((f"%{ch:02X}").encode("ascii"))

        ret2 = ret.decode("ascii")
        # Return the original object when nothing changed (identity reuse).
        if ret2 == val:
            return val
        return ret2
class _Unquoter:
    """Percent-decode a str; chars in *unsafe*/*ignore* stay escaped."""

    def __init__(
        self,
        *,
        ignore: str = "",
        unsafe: str = "",
        qs: bool = False,
        plus: bool = False,
    ) -> None:
        self._ignore = ignore
        self._unsafe = unsafe
        self._qs = qs
        self._plus = plus  # to match urllib.parse.unquote_plus
        self._quoter = _Quoter()
        self._qs_quoter = _Quoter(qs=True)

    @overload
    def __call__(self, val: str) -> str: ...
    @overload
    def __call__(self, val: None) -> None: ...
    def __call__(self, val: Union[str, None]) -> Union[str, None]:
        if val is None:
            return None
        if not isinstance(val, str):
            raise TypeError("Argument should be str")
        if not val:
            return ""
        # Incremental decoder buffers partial multi-byte UTF-8 sequences
        # spread over consecutive %XX escapes.
        decoder = cast(codecs.BufferedIncrementalDecoder, utf8_decoder())
        ret = []
        idx = 0
        while idx < len(val):
            ch = val[idx]
            idx += 1
            if ch == "%" and idx <= len(val) - 2:
                pct = val[idx : idx + 2]
                if _IS_HEX_STR.fullmatch(pct):
                    b = bytes([int(pct, base=16)])
                    idx += 2
                    try:
                        unquoted = decoder.decode(b)
                    except UnicodeDecodeError:
                        # Flush the undecodable escape run verbatim and
                        # retry the decoder with just the current byte.
                        start_pct = idx - 3 - len(decoder.buffer) * 3
                        ret.append(val[start_pct : idx - 3])
                        decoder.reset()
                        try:
                            unquoted = decoder.decode(b)
                        except UnicodeDecodeError:
                            ret.append(val[idx - 3 : idx])
                            continue
                    if not unquoted:
                        # Incomplete multi-byte sequence: keep buffering.
                        continue
                    if self._qs and unquoted in "+=&;":
                        # QS separators must stay escaped inside a query.
                        to_add = self._qs_quoter(unquoted)
                        if to_add is None:  # pragma: no cover
                            raise RuntimeError("Cannot quote None")
                        ret.append(to_add)
                    elif unquoted in self._unsafe or unquoted in self._ignore:
                        to_add = self._quoter(unquoted)
                        if to_add is None:  # pragma: no cover
                            raise RuntimeError("Cannot quote None")
                        ret.append(to_add)
                    else:
                        ret.append(unquoted)
                    continue
            if decoder.buffer:
                # A non-escape char ends a pending escape run: emit the raw
                # "%XX..." text that produced the buffered bytes.
                start_pct = idx - 1 - len(decoder.buffer) * 3
                ret.append(val[start_pct : idx - 1])
                decoder.reset()
            if ch == "+":
                if (not self._qs and not self._plus) or ch in self._unsafe:
                    ret.append("+")
                else:
                    ret.append(" ")
                continue
            if ch in self._unsafe:
                # Re-escape literal unsafe chars as uppercase %XX.
                ret.append("%")
                h = hex(ord(ch)).upper()[2:]
                for ch in h:
                    ret.append(ch)
                continue
            ret.append(ch)
        if decoder.buffer:
            # Trailing undecodable escapes are emitted verbatim.
            ret.append(val[-len(decoder.buffer) * 3 :])
        ret2 = "".join(ret)
        # Return the original object when nothing changed (identity reuse).
        if ret2 == val:
            return val
        return ret2

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1 @@
# Placeholder