From 8f4650f7423b58cd8d6839afc2651357ee7b9c45 Mon Sep 17 00:00:00 2001 From: Alexandre Gomes Gaigalas Date: Tue, 3 Feb 2026 23:17:26 -0300 Subject: [PATCH] Compile the pattern for PatternFormatter The verification that was made upon the pattern was almost a parser/tokenizer in itself. This change leverages that behavior to introduce full pattern compilation. When the PatternFormatter encounters a pattern it has never seen before, it compiles that pattern into a three-part CompiledPattern instance that has a search regex, a replacement pattern and instructions for the callback. Upon seeing an already compiled pattern, all the PatternFormatter has to do is go through the motions of the existing compiled pattern (one preg_match followed by one preg_replace_callback). Further steps for pattern canonicalization could be taken, such as normalizing equivalent patterns into a single form, so they could share the same cache entry. However, that micro-optimization was too expensive and counter-productive. This change also opens up possibilities for in-file warmup, as CompiledPattern instances are simple objects. A user could pre-compile their hot-path patterns beforehand to share the cache even across different processes. 
---
 src/Internal/CompiledPattern.php    | 269 ++++++++++++++++++++++++++++
 src/PatternFormatter.php            | 245 ++++---------------------
 tests/Unit/PatternFormatterTest.php |   7 +
 3 files changed, 308 insertions(+), 213 deletions(-)
 create mode 100644 src/Internal/CompiledPattern.php

diff --git a/src/Internal/CompiledPattern.php b/src/Internal/CompiledPattern.php
new file mode 100644
index 0000000..3328097
--- /dev/null
+++ b/src/Internal/CompiledPattern.php
@@ -0,0 +1,269 @@
+<?php
+
+/*
+ * SPDX-FileCopyrightText: (c) Respect Project Contributors
+ * SPDX-License-Identifier: ISC
+ * SPDX-FileContributor: Alexandre Gomes Gaigalas
+ */
+
+declare(strict_types=1);
+
+namespace Respect\StringFormatter\Internal;
+
+use Respect\StringFormatter\InvalidFormatterException;
+
+use function array_keys;
+use function array_map;
+use function count;
+use function implode;
+use function mb_str_split;
+use function mb_strlen;
+use function mb_strtolower;
+use function mb_strtoupper;
+use function mb_substr;
+use function preg_match;
+use function preg_match_all;
+use function sprintf;
+use function str_replace;
+use function str_starts_with;
+use function strtolower;
+use function substr;
+
+use const PREG_OFFSET_CAPTURE;
+
+/**
+ * Cached, immutable compilation of a PatternFormatter pattern.
+ *
+ * - `search`: anchored regex with one capture group per filter (or `\d`) token.
+ * - `replacement`: output template; `%N$` stands for capture group N, `%%` for a literal `%`.
+ * - `instructions`: per group, the regex that picks the accepted characters out of the captured
+ *   text, the persistent transformation (`transform`) and the single-use one (`once`).
+ */
+final class CompiledPattern
+{
+    /** Filter character => regex fragment matching exactly one accepted character. */
+    private const array FILTERS = [
+        '#' => '.',
+        '0' => '\p{N}',
+        'A' => '\p{Lu}',
+        'a' => '\p{Ll}',
+        'C' => '\p{L}',
+        // Must be a single atom: inside "(?:.*?%s)" a bare "\p{L}|\p{N}" alternation would let
+        // digits skipped on the way to a letter leak into the output.
+        'W' => '[\p{L}\p{N}]',
+    ];
+
+    private const array TRANSFORM_MAP = ['l' => 'lower', 'u' => 'upper', 'i' => 'invert'];
+
+    /** Matches a token that is a quantifier: "{...}", "*", "+" or "?". */
+    private const string QUANTIFIER = '/^(?:\{[^}]*\}|[*+?])$/u';
+
+    /** @var array<string, self> */
+    private static array $compiledPatterns = [];
+
+    /** @var array<string, string> */
+    private static array $compiledQualifiers = [];
+
+    /** @param array<int, array{filter: string, transform: string|null, once: string|null}> $instructions */
+    private function __construct(
+        private(set) readonly string $pattern,
+        private(set) readonly string $search,
+        private(set) readonly string $replacement,
+        private(set) readonly array $instructions,
+    ) {
+    }
+
+    /**
+     * Compiles a pattern, reusing the cached instance when the same pattern was compiled before.
+     *
+     * @throws InvalidFormatterException When the pattern is empty or malformed
+     */
+    public static function compile(string $pattern): self
+    {
+        if (isset(self::$compiledPatterns[$pattern])) {
+            return self::$compiledPatterns[$pattern];
+        }
+
+        if ($pattern === '') {
+            throw new InvalidFormatterException('Pattern cannot be empty');
+        }
+
+        $search = '';
+        $replacement = '';
+        $instructions = [];
+        $groupIndex = 1;
+
+        $transformState = null;
+        $nextTransform = null;
+
+        // The "s" modifier lets an escape sequence cover a line break as well.
+        $tokenized = preg_match_all(sprintf(
+            '/(?:\\\\.|[%1$s]|(?:\{[^}]*\}|[*+?])|[^\\\%1$s{}+*?]+|.)/us',
+            implode('', array_keys(self::FILTERS)),
+        ), $pattern, $tokens, PREG_OFFSET_CAPTURE);
+
+        if ($tokenized === false) {
+            throw new InvalidFormatterException('Pattern could not be tokenized; it must be valid UTF-8');
+        }
+
+        $tokenList = $tokens[0];
+        $count = count($tokenList);
+
+        for ($i = 0; $i < $count; $i++) {
+            [$tokenText, $offset] = $tokenList[$i];
+
+            if (str_starts_with($tokenText, '\\')) {
+                if ($tokenText === '\\') {
+                    throw new InvalidFormatterException('Incomplete escape sequence at end of pattern');
+                }
+
+                $char = mb_substr($tokenText, 1);
+
+                if ($char === 'd') {
+                    $inner = '.';
+                    $search .= sprintf('((?:.*?%s){0,1})', $inner);
+                    $replacement .= sprintf('%%%d$', $groupIndex);
+                    $instructions[$groupIndex] = [
+                        'filter' => sprintf('/%s/u', $inner),
+                        'transform' => 'delete',
+                        'once' => null,
+                    ];
+                    $groupIndex++;
+                    continue;
+                }
+
+                if ($char === 'E') {
+                    $transformState = null;
+                    continue;
+                }
+
+                if (isset(self::TRANSFORM_MAP[$char])) {
+                    $nextTransform = self::TRANSFORM_MAP[$char];
+                    continue;
+                }
+
+                $lowerChar = strtolower($char);
+                if (isset(self::TRANSFORM_MAP[$lowerChar]) && $char !== $lowerChar) {
+                    $transformState = self::TRANSFORM_MAP[$lowerChar];
+                    continue;
+                }
+
+                $replacement .= self::escapeLiteral($char);
+                continue;
+            }
+
+            if (isset(self::FILTERS[$tokenText])) {
+                $regexQuantifier = '{0,1}';
+
+                if (isset($tokenList[$i + 1]) && preg_match(self::QUANTIFIER, $tokenList[$i + 1][0]) === 1) {
+                    $i++;
+                    $regexQuantifier = self::compileQualifier(
+                        $tokenList[$i][0],
+                        self::position($pattern, $tokenList[$i][1]),
+                    );
+                }
+
+                $inner = self::FILTERS[$tokenText];
+                $search .= sprintf('((?:.*?%s)%s)', $inner, $regexQuantifier);
+
+                $replacement .= sprintf('%%%d$', $groupIndex);
+                $instructions[$groupIndex] = [
+                    'filter' => sprintf('/%s/u', $inner),
+                    'transform' => $transformState,
+                    // A single-use transformation applies to the first matched character only.
+                    'once' => $nextTransform,
+                ];
+
+                $groupIndex++;
+                $nextTransform = null;
+                continue;
+            }
+
+            if (preg_match(self::QUANTIFIER, $tokenText) === 1) {
+                throw new InvalidFormatterException(sprintf(
+                    'Quantifier "%s" must follow a filter pattern at position %d',
+                    $tokenText[0],
+                    self::position($pattern, $offset),
+                ));
+            }
+
+            if (str_starts_with($tokenText, '{')) {
+                throw new InvalidFormatterException(
+                    sprintf('Invalid or malformed quantifier at position %d', self::position($pattern, $offset)),
+                );
+            }
+
+            $replacement .= self::escapeLiteral($tokenText);
+        }
+
+        return self::$compiledPatterns[$pattern] = new self(
+            $pattern,
+            '/^' . $search . '/us',
+            $replacement,
+            $instructions,
+        );
+    }
+
+    /** Applies a named transformation to $val; null and unknown names leave it untouched. */
+    public static function transform(string $val, string|null $transform): string
+    {
+        return match ($transform) {
+            'delete' => '',
+            'lower' => mb_strtolower($val),
+            'upper' => mb_strtoupper($val),
+            // Inverted per character: case mapping may change the byte length, so the strings
+            // cannot be combined byte by byte.
+            'invert' => implode('', array_map(
+                static fn (string $char): string => mb_strtolower($char) === $char
+                    ? mb_strtoupper($char)
+                    : mb_strtolower($char),
+                mb_str_split($val),
+            )),
+            default => $val,
+        };
+    }
+
+    /**
+     * Translates a pattern quantifier into the regex quantifier of its capture group.
+     *
+     * Only the upper bound is compiled: an enforced lower bound would make the whole search
+     * regex fail, and every group come out empty, whenever the input runs short.
+     *
+     * NOTE(review): "?" is tokenized as a quantifier but has no translation here, so it is
+     * rejected as malformed; confirm that is intended.
+     *
+     * @throws InvalidFormatterException When the quantifier is malformed
+     */
+    private static function compileQualifier(string $token, int $offset): string
+    {
+        if (isset(self::$compiledQualifiers[$token])) {
+            return self::$compiledQualifiers[$token];
+        }
+
+        if ($token === '*' || $token === '+') {
+            return '*';
+        }
+
+        // "D" keeps "$" from accepting a trailing line break.
+        if (preg_match('/^\{(?:\d+(?:,\d*)?|,\d+)\}$/D', $token) !== 1) {
+            throw new InvalidFormatterException(sprintf('Invalid or malformed quantifier at position %d', $offset));
+        }
+
+        preg_match('/^\{(\d*)(?:,(\d*))?\}$/', $token, $m);
+        $max = $m[2] ?? $m[1];
+
+        return self::$compiledQualifiers[$token] = $max === '' ? '*' : sprintf('{0,%s}', $max);
+    }
+
+    /** Escapes "%" so that literal text is never read as a "%N$" group placeholder. */
+    private static function escapeLiteral(string $text): string
+    {
+        return str_replace('%', '%%', $text);
+    }
+
+    /** Converts a byte offset reported by PCRE into a character position within the pattern. */
+    private static function position(string $pattern, int $byteOffset): int
+    {
+        return mb_strlen(substr($pattern, 0, $byteOffset));
+    }
+}
diff --git a/src/PatternFormatter.php b/src/PatternFormatter.php
index d986bd1..f193d54 100644
--- a/src/PatternFormatter.php
+++ b/src/PatternFormatter.php
@@ -3,6 +3,7 @@
 /*
  * SPDX-FileCopyrightText: (c) Respect Project Contributors
  * SPDX-License-Identifier: ISC
+ * SPDX-FileContributor: Alexandre Gomes Gaigalas
  * SPDX-FileContributor: Henrique Moody
  */
 
@@ -10,232 +11,50 @@
 
 namespace Respect\StringFormatter;
 
-use function array_key_exists;
-use function count;
+use Respect\StringFormatter\Internal\CompiledPattern;
+
+use function array_shift;
 use function implode;
-use function lcfirst;
-use function mb_str_split;
-use function mb_strlen;
-use function mb_strtolower;
-use function mb_strtoupper;
-use function mb_substr;
 use function preg_match;
-use function sprintf;
+use function preg_match_all;
+use function preg_replace_callback;
 
 final readonly class PatternFormatter implements Formatter
 {
-    private const array FILTERS = [
-        '#' => '/^.$/u',
-        '0' => '/^[0-9]$/',
-        'A' => '/^[A-Z]$/',
-        'a' => '/^[a-z]$/',
-        'C' => '/^\p{L}$/u',
-        'W' => '/^[\p{L}\p{N}]$/u',
-    ];
-
-    private const array TRANSFORMATIONS = [
-        'd' => 'delete',
-        'l' => 'lower',
-        'L' => 'LOWER',
-        'u' => 'upper',
-        'U' => 'UPPER',
-        'i' => 'invert',
-        'I' => 'INVERT',
-        'E' => 'reset',
-    ];
-
-    public function __construct(
-        private string $pattern,
-    ) {
-        $this->validatePattern();
-    }
+    private CompiledPattern $compiledPattern;
 
-    public function format(string $input): string
+    public function __construct(private string $pattern)
     {
-        $chars = mb_str_split($input);
-        $charIndex = 0;
-        $output = [];
-        $transform = null;
-        $patternLength = mb_strlen($this->pattern);
-
-        for ($i = 0; $i < $patternLength; $i++) {
-            $char = mb_substr($this->pattern, $i, 1);
-
-            // Handle escape sequences
-            if ($char === '\\' && $i + 1 < $patternLength) {
-                $next = mb_substr($this->pattern, $i + 1, 1);
-
-                if (array_key_exists($next, self::TRANSFORMATIONS)) {
-                    $type = self::TRANSFORMATIONS[$next];
-                    if ($type === 'delete') {
-                        $charIndex++;
-                    } elseif ($type === 'reset') {
-                        $transform = null;
-                    } else {
-                        $transform = $type;
-                    }
-
-                    $i++;
-                    continue;
-                }
-
-                // Escaped literal character
-                $output[] = $next;
-                $i++;
-                continue;
-            }
-
-            // Handle filter patterns
-            if (array_key_exists($char, self::FILTERS)) {
-                $repetition = $this->parseRepetition($i + 1);
-                if ($repetition !== null) {
-                    [, $max, $consumed] = $repetition;
-                    $i += $consumed;
-                } else {
-                    $max = 1;
-                }
-
-                $count = 0;
-                while (($max === null || $count < $max) && $charIndex < count($chars)) {
-                    if (!$this->matches($char, $chars[$charIndex])) {
-                        $charIndex++;
-                        continue;
-                    }
-
-                    $output[] = $this->applyTransform($chars[$charIndex++], $transform);
-                    $count++;
-
-                    if ($transform === null || $transform !== lcfirst($transform)) {
-                        continue;
-                    }
-
-                    $transform = null; // Clear single-use (lowercase) transformations
-                }
-
-                continue;
-            }
-
-            // Literal character
-            $output[] = $char;
-        }
-
-        return implode('', $output);
+        $this->compiledPattern = CompiledPattern::compile($this->pattern);
     }
 
-    private function validatePattern(): void
+    public function format(string $input): string
     {
-        if ($this->pattern === '') {
-            throw new InvalidFormatterException('Pattern cannot be empty');
-        }
-
-        $length = mb_strlen($this->pattern);
-
-        for ($i = 0; $i < $length; $i++) {
-            $char = mb_substr($this->pattern, $i, 1);
+        $matches = [];
+        preg_match($this->compiledPattern->search, $input, $matches);
 
-            // Check escape sequences
-            if ($char === '\\') {
-                if ($i + 1 >= $length) {
-                    throw new InvalidFormatterException('Incomplete escape sequence at end of pattern');
-                }
-
-                $i++; // Skip the escaped character
-                continue;
+        return preg_replace_callback('/%(?:%|(\d+)\$)/', function (array $m) use ($matches): string {
+            // "%%" is how CompiledPattern escapes a literal percent sign.
+            if ($m[0] === '%%') {
+                return '%';
             }
 
-            // Check for orphaned quantifiers (not after a filter)
-            if ($char === '+' || $char === '*') {
-                throw new InvalidFormatterException(
-                    sprintf('Quantifier "%s" must follow a filter pattern at position %d', $char, $i),
-                );
-            }
+            $idx = (int) $m[1];
+            if (!isset($matches[$idx]) || $matches[$idx] === '') {
+                return '';
+            }
+
+            $instr = $this->compiledPattern->instructions[$idx];
+            preg_match_all($instr['filter'], $matches[$idx], $subMatches);
 
-            // Check for brace quantifiers
-            if ($char === '{') {
-                $remaining = mb_substr($this->pattern, $i);
-                if (!$this->isValidBraceQuantifier($remaining)) {
-                    throw new InvalidFormatterException(
-                        sprintf('Invalid or malformed quantifier at position %d', $i),
-                    );
-                }
-            }
-
-            // If it's a filter, skip any following quantifier
-            if (!array_key_exists($char, self::FILTERS)) {
-                continue;
-            }
-
-            $repetition = $this->parseRepetition($i + 1);
-            if ($repetition === null) {
-                continue;
-            }
-
-            $i += $repetition[2];
-        }
-    }
-
-    private function isValidBraceQuantifier(string $remaining): bool
-    {
-        // Matches exact count, range with min, or range with max only
-        return preg_match('/^\{(\d+)\}/', $remaining) === 1
-            || preg_match('/^\{(\d+),(\d*)\}/', $remaining) === 1
-            || preg_match('/^\{,(\d+)\}/', $remaining) === 1;
-    }
-
-    /**
-     * Parses a repetition quantifier (+, *, {n}, {n,}, {,m}, or {n,m}) starting at the given position.
-     *
-     * @return array{int, int|null, int}|null Returns [min, max, consumed chars] or null if no valid quantifier
-     */
-    private function parseRepetition(int $position): array|null
-    {
-        $remaining = mb_substr($this->pattern, $position);
-
-        // Match + for one or more
-        if (mb_substr($remaining, 0, 1) === '+') {
-            return [1, null, 1];
-        }
-
-        // Match * for zero or more
-        if (mb_substr($remaining, 0, 1) === '*') {
-            return [0, null, 1];
-        }
-
-        // Match {n} for exact count
-        if (preg_match('/^\{(\d+)\}/', $remaining, $matches) === 1) {
-            $count = (int) $matches[1];
-
-            return [$count, $count, mb_strlen($matches[0])];
-        }
-
-        // Match range quantifiers with minimum specified
-        if (preg_match('/^\{(\d+),(\d*)\}/', $remaining, $matches) === 1) {
-            $min = (int) $matches[1];
-            $max = $matches[2] === '' ? null : (int) $matches[2];
-
-            return [$min, $max, mb_strlen($matches[0])];
-        }
-
-        // Match range quantifiers with only maximum specified
-        if (preg_match('/^\{,(\d+)\}/', $remaining, $matches) === 1) {
-            return [0, (int) $matches[1], mb_strlen($matches[0])];
-        }
-
-        return null;
-    }
-
-    private function matches(string $filter, string $char): bool
-    {
-        return preg_match(self::FILTERS[$filter], $char) === 1;
-    }
-
-    private function applyTransform(string $char, string|null $transform): string
-    {
-        return match ($transform) {
-            'lower', 'LOWER' => mb_strtolower($char),
-            'upper', 'UPPER' => mb_strtoupper($char),
-            'invert', 'INVERT' => mb_strtolower($char) === $char ? mb_strtoupper($char) : mb_strtolower($char),
-            default => $char,
-        };
+            $chars = $subMatches[0];
+            $head = '';
+            if ($instr['once'] !== null && $chars !== []) {
+                // A single-use transformation only affects the first character of the run.
+                $head = CompiledPattern::transform(array_shift($chars), $instr['once']);
+            }
+
+            return $head . CompiledPattern::transform(implode('', $chars), $instr['transform']);
+        }, $this->compiledPattern->replacement) ?? '';
     }
 }
diff --git a/tests/Unit/PatternFormatterTest.php b/tests/Unit/PatternFormatterTest.php
index 2b491d1..713b68d 100644
--- a/tests/Unit/PatternFormatterTest.php
+++ b/tests/Unit/PatternFormatterTest.php
@@ -14,10 +14,12 @@
 use PHPUnit\Framework\Attributes\DataProvider;
 use PHPUnit\Framework\Attributes\Test;
 use PHPUnit\Framework\TestCase;
+use Respect\StringFormatter\Internal\CompiledPattern;
 use Respect\StringFormatter\InvalidFormatterException;
 use Respect\StringFormatter\PatternFormatter;
 
 #[CoversClass(PatternFormatter::class)]
+#[CoversClass(CompiledPattern::class)]
 final class PatternFormatterTest extends TestCase
 {
     #[Test]
@@ -313,6 +315,11 @@ public static function providerForUnicodeSupport(): array
                 'ábc',
                 'ñ-ábc',
             ],
+            'unicode edge-case' => [
+                '0-0-0',
+                'ⅫⅫⅫ',
+                'Ⅻ-Ⅻ-Ⅻ',
+            ],
         ];
     }
 