Page MenuHomePhorge

regex.c
No OneTemporary

Size
76 KB
Referenced Files
None
Subscribers
None
// TODO: clean up this module.
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include "ant.h"
#include "utf8.h"
#include "errors.h"
#include "runtime.h"
#include "internal.h"
#include "utils.h"
#include "escape.h"
#include "descriptors.h"
#include "silver/engine.h"
#include "modules/regex.h"
#include "modules/symbol.h"
#include "gc/objects.h"
#include <pcre2.h>
// One slot of the compiled-pattern cache. Entries are matched by object
// identity (see regex_cache_lookup) and own the PCRE2 resources they hold
// (builtin_regexp_compile frees both when it evicts an entry).
typedef struct {
ant_object_t *obj; // RegExp object this entry belongs to; used as the lookup key
pcre2_code *code; // compiled PCRE2 pattern
pcre2_match_data *match_data; // match data block created from `code`
bool jit_ready; // true when pcre2_jit_compile() returned 0 for `code`
} regex_cache_entry_t;
/* Bit mask values for RegExp flags; each constant corresponds to one flag
 * letter as decoded by regexp_parse_flags_mask(). All eight fit in a uint8_t. */
enum {
  REGEXP_FLAG_HAS_INDICES = 0x01, /* 'd' */
  REGEXP_FLAG_GLOBAL      = 0x02, /* 'g' */
  REGEXP_FLAG_IGNORE_CASE = 0x04, /* 'i' */
  REGEXP_FLAG_MULTILINE   = 0x08, /* 'm' */
  REGEXP_FLAG_DOTALL      = 0x10, /* 's' */
  REGEXP_FLAG_UNICODE     = 0x20, /* 'u' */
  REGEXP_FLAG_UNICODE_SET = 0x40, /* 'v' */
  REGEXP_FLAG_STICKY      = 0x80, /* 'y' */
};
// Growable array of compiled patterns; searched linearly by
// regex_cache_lookup and extended by regex_cache_insert.
static regex_cache_entry_t *regex_cache = NULL;
// NOTE(review): not referenced in the visible part of this file; presumably
// the prototype object for matchAll iterators - confirm against its users.
static ant_value_t regexp_matchall_iter_proto_val = 0;
// Number of entries currently stored in regex_cache.
static size_t regex_cache_count = 0;
// Allocated capacity of regex_cache, in entries.
static size_t regex_cache_cap = 0;
/*
 * Convert a flags string into a REGEXP_FLAG_* bit mask.
 *
 * fstr: flag characters (need not be NUL-terminated); flen: how many to read.
 * Characters that are not one of d, g, i, m, s, u, v, y contribute nothing,
 * and repeated flags are simply OR-ed in again.
 */
static inline uint8_t regexp_parse_flags_mask(const char *fstr, ant_offset_t flen) {
  uint8_t bits = 0;
  ant_offset_t pos = 0;
  while (pos < flen) {
    uint8_t bit = 0;
    switch (fstr[pos]) {
      case 'd': bit = REGEXP_FLAG_HAS_INDICES; break;
      case 'g': bit = REGEXP_FLAG_GLOBAL;      break;
      case 'i': bit = REGEXP_FLAG_IGNORE_CASE; break;
      case 'm': bit = REGEXP_FLAG_MULTILINE;   break;
      case 's': bit = REGEXP_FLAG_DOTALL;      break;
      case 'u': bit = REGEXP_FLAG_UNICODE;     break;
      case 'v': bit = REGEXP_FLAG_UNICODE_SET; break;
      case 'y': bit = REGEXP_FLAG_STICKY;      break;
      default:  break;
    }
    bits |= bit;
    pos++;
  }
  return bits;
}
/*
 * Return the REGEXP_FLAG_* mask for a RegExp object's current "flags" property.
 *
 * The mask is memoised in SLOT_REGEXP_FLAGS_MASK together with the flags
 * string it was derived from (SLOT_REGEXP_FLAGS_STRING); the memo is reused
 * only while the property still holds that same string value.
 * Returns 0 when the property is missing or is not a string.
 */
static inline uint8_t regexp_flags_mask(ant_t *js, ant_value_t regexp) {
  ant_offset_t prop_ref = lkp(js, regexp, "flags", 5);
  if (prop_ref == 0) return 0;

  ant_value_t current = js_propref_load(js, prop_ref);
  if (vtype(current) != T_STR) return 0;

  ant_value_t memo_string = js_get_slot(regexp, SLOT_REGEXP_FLAGS_STRING);
  ant_value_t memo_mask = js_get_slot(regexp, SLOT_REGEXP_FLAGS_MASK);
  bool memo_valid = (current == memo_string) && (vtype(memo_mask) == T_NUM);
  if (memo_valid) return (uint8_t)tod(memo_mask);

  /* Memo is stale or absent: re-parse and store both halves. */
  ant_offset_t text_len;
  ant_offset_t text_off = vstr(js, current, &text_len);
  uint8_t fresh = regexp_parse_flags_mask((const char *)(uintptr_t)text_off, text_len);
  js_set_slot(regexp, SLOT_REGEXP_FLAGS_MASK, tov((double)fresh));
  js_set_slot(regexp, SLOT_REGEXP_FLAGS_STRING, current);
  return fresh;
}
/*
 * Build the named-group metadata array for a compiled pattern.
 *
 * The result is a flat JS array of [name0, number0, name1, number1, ...]
 * taken from PCRE2's name table. Returns undefined when the pattern has no
 * named groups, or an error value if allocating the array or a name fails.
 */
static ant_value_t regexp_build_named_groups_meta(ant_t *js, pcre2_code *code) {
  uint32_t group_count = 0;
  pcre2_pattern_info(code, PCRE2_INFO_NAMECOUNT, &group_count);
  if (group_count == 0) return js_mkundef();

  uint32_t entry_size = 0;
  PCRE2_SPTR table = NULL;
  pcre2_pattern_info(code, PCRE2_INFO_NAMEENTRYSIZE, &entry_size);
  pcre2_pattern_info(code, PCRE2_INFO_NAMETABLE, (void *)&table);

  ant_value_t meta = js_mkarr(js);
  if (is_err(meta)) return meta;

  PCRE2_SPTR entry = table;
  uint32_t done = 0;
  while (done < group_count) {
    /* Each entry: two bytes of group number (high byte first), then the
     * NUL-terminated group name. */
    int group_number = (entry[0] << 8) | entry[1];
    const char *group_name = (const char *)(entry + 2);

    ant_value_t name_val = js_mkstr(js, group_name, strlen(group_name));
    if (is_err(name_val)) return name_val;

    js_arr_push(js, meta, name_val);
    js_arr_push(js, meta, tov((double)group_number));

    entry += entry_size;
    done++;
  }
  return meta;
}
static void update_regexp_statics(ant_t *js, const char *str_ptr, PCRE2_SIZE *ovector, uint32_t ovcount) {
ant_value_t regexp_ctor = js_get(js, js_glob(js), "RegExp");
if (is_err(regexp_ctor) || vtype(regexp_ctor) == T_UNDEF) return;
ant_value_t empty = js_mkstr(js, "", 0);
for (int i = 1; i <= 9; i++) {
char key[3] = {'$', (char)('0' + i), '\0'};
ant_value_t val = empty;
if ((uint32_t)i < ovcount && ovector[2*i] != PCRE2_UNSET)
val = js_mkstr(js, str_ptr + ovector[2*i], ovector[2*i+1] - ovector[2*i]);
if (is_err(setprop_cstr(js, regexp_ctor, key, 2, val))) return;
}
ant_value_t match0 = empty;
if (ovcount > 0 && ovector[0] != PCRE2_UNSET)
match0 = js_mkstr(js, str_ptr + ovector[0], ovector[1] - ovector[0]);
if (is_err(setprop_cstr(js, regexp_ctor, "lastMatch", 9, match0))) return;
(void)setprop_cstr(js, regexp_ctor, "$&", 2, match0);
}
/*
 * True when the escape sequence "\c" may be copied into the PCRE2 pattern
 * unchanged: class shorthands, word boundaries, control-character escapes,
 * back-references \1-\9, and escaped regex punctuation.
 */
static inline bool is_pcre2_passthrough_escape(char c) {
  static const char passthrough[] = "dDwWsSbBnrtf123456789.*+?()[]{}|^$\\/-";
  /* strchr() would also match the terminating NUL, so exclude it explicitly. */
  return c != '\0' && strchr(passthrough, c) != NULL;
}
/* True for the letter of a character-class shorthand escape: \w \W \d \D \s \S. */
static inline bool is_class_shorthand(char c) {
  switch (c) {
    case 'w': case 'W':
    case 'd': case 'D':
    case 's': case 'S':
      return true;
    default:
      return false;
  }
}
/*
 * Find the ']' that balances the '[' at index `open`.
 *
 * Scans src[open..src_len), counting nested '[' / ']' pairs and skipping the
 * character after each backslash. Returns the index of the bracket that
 * brings the nesting count back to zero, or src_len if there is none.
 */
static size_t v_close_bracket(const char *src, size_t src_len, size_t open) {
  int nesting = 0;
  size_t pos = open;
  while (pos < src_len) {
    char ch = src[pos];
    if (ch == '\\' && pos + 1 < src_len) {
      pos += 2; /* skip the backslash and the escaped character */
      continue;
    }
    if (ch == '[') {
      nesting++;
    } else if (ch == ']') {
      nesting--;
      if (nesting == 0) return pos;
    }
    pos++;
  }
  return src_len;
}
/*
 * Translate one operand of a v-flag set operation into PCRE2 syntax.
 *
 * An operand that already begins with '[' is translated as-is; anything else
 * is first wrapped in "[...]" so it becomes a character class. Returns the
 * number of bytes written to `out`, or 0 (writing nothing) when the operand
 * is too long for the 1024-byte scratch buffer.
 */
static size_t v_translate_part(const char *p, size_t len, char *out, size_t out_size) {
  bool already_class = (len != 0 && p[0] == '[');
  if (already_class) return js_to_pcre2_pattern(p, len, out, out_size, false);

  char wrapped[1024];
  if (len >= sizeof(wrapped) - 2) return 0; /* need room for both brackets */

  wrapped[0] = '[';
  memcpy(&wrapped[1], p, len);
  wrapped[len + 1] = ']';
  return js_to_pcre2_pattern(wrapped, len + 2, out, out_size, false);
}
/*
 * Look for a top-level v-flag set operator inside src[start..end).
 *
 * Returns 1 for "&&" (intersection), 2 for "--" (subtraction) and stores the
 * operator's index in *op_pos; returns 0 (leaving *op_pos untouched) when no
 * operator is found. Escapes are skipped - including whole \p{..}, \P{..},
 * \u{..} and \x{..} groups - as are nested [...] classes. An unbalanced ']'
 * ends the search.
 */
static int v_set_op(const char *src, size_t start, size_t end, size_t *op_pos) {
  int nesting = 0;
  size_t pos = start;

  while (pos < end) {
    char ch = src[pos];

    if (ch == '\\' && pos + 1 < end) {
      char kind = src[pos + 1];
      bool may_be_braced = (kind == 'p' || kind == 'P' || kind == 'u' || kind == 'x');
      if (may_be_braced && pos + 2 < end && src[pos + 2] == '{') {
        pos += 3;
        while (pos < end && src[pos] != '}') pos++;
        if (pos < end) pos++; /* step over the closing brace */
      } else {
        pos += 2;
      }
      continue;
    }

    if (ch == '[') {
      nesting++;
      pos++;
      continue;
    }

    if (ch == ']') {
      if (nesting == 0) break;
      nesting--;
      pos++;
      continue;
    }

    if (nesting == 0 && pos + 1 < end) {
      char following = src[pos + 1];
      if (ch == '&' && following == '&') { *op_pos = pos; return 1; }
      if (ch == '-' && following == '-') { *op_pos = pos; return 2; }
    }
    pos++;
  }
  return 0;
}
/*
 * Translate a JavaScript regular-expression source into PCRE2 syntax.
 *
 * src/src_len: pattern text (need not be NUL-terminated).
 * dst/dst_size: output buffer; the result is always NUL-terminated and is
 *               silently truncated once dst_size - 1 bytes have been written.
 * v_flag: enables rewriting of top-level v-flag set operations
 *         ([A&&B] -> (?=A)B, [A--B] -> (?!B)A).
 *
 * Returns the number of bytes written, excluding the terminator. Returns 0
 * without touching dst when dst is NULL or dst_size is 0.
 *
 * Rewrites performed:
 *   - '-' next to a class shorthand inside [...] is escaped as "\-"
 *   - \v -> \x{0b};  \u{H..} and \uHHHH -> \x{..};  a bare \u or \x -> the letter
 *   - \0 and legacy octal escapes -> \x{..}
 *   - \cX is kept for letters; any other \c becomes a literal backslash + 'c'
 *   - \p{..}/\P{..}: long General_Category names are shortened, a few
 *     properties/scripts are expanded to explicit ranges or alternations
 *   - escapes accepted by is_pcre2_passthrough_escape() are copied verbatim;
 *     every other escape is reduced to the bare character
 */
size_t js_to_pcre2_pattern(const char *src, size_t src_len, char *dst, size_t dst_size, bool v_flag) {
  /* Without this guard `dst_size - 1` below wraps to SIZE_MAX when
   * dst_size == 0, disabling every bounds check and writing past dst. */
  if (dst == NULL || dst_size == 0) return 0;

  size_t di = 0;
  int charclass_depth = 0;
#define OUT(ch) do { if (di < dst_size - 1) dst[di++] = (ch); } while(0)
  for (size_t si = 0; si < src_len && di < dst_size - 1; si++) {
    if (src[si] == '[') {
      if (v_flag && charclass_depth == 0) {
        size_t close = v_close_bracket(src, src_len, si);
        size_t op_pos;
        int op_type = v_set_op(src, si + 1, close, &op_pos);
        if (op_type && close < src_len) {
          /* Set operation: emit a lookahead on one operand followed by the
           * other operand. For subtraction the operands are swapped so the
           * negative lookahead tests the right-hand side. */
          char ao[1024], bo[1024];
          size_t aol = v_translate_part(&src[si + 1], op_pos - si - 1, ao, sizeof(ao));
          size_t bol = v_translate_part(&src[op_pos + 2], close - op_pos - 2, bo, sizeof(bo));
          const char *la = op_type == 1 ? ao : bo, *ra = op_type == 1 ? bo : ao;
          size_t ll = op_type == 1 ? aol : bol, rl = op_type == 1 ? bol : aol;
          OUT('('); OUT('?'); OUT(op_type == 1 ? '=' : '!');
          for (size_t k = 0; k < ll; k++) OUT(la[k]);
          OUT(')');
          for (size_t k = 0; k < rl; k++) OUT(ra[k]);
          si = close;
          continue;
        }
      }
      charclass_depth++;
      OUT('[');
      continue;
    }
    if (src[si] == ']' && charclass_depth > 0) {
      charclass_depth--;
      OUT(']');
      continue;
    }
    if (charclass_depth > 0 && src[si] == '-' && si > 0 && src[si - 1] != '[' &&
        si + 1 < src_len && src[si + 1] != ']') {
      /* A '-' adjacent to \w, \d, \s (or their negations) is escaped so it
       * is emitted as a literal hyphen rather than a range operator. */
      bool prev_is_shorthand = (si >= 2 && src[si - 2] == '\\' && is_class_shorthand(src[si - 1]));
      bool next_is_shorthand = (si + 2 < src_len && src[si + 1] == '\\' && is_class_shorthand(src[si + 2]));
      if (prev_is_shorthand || next_is_shorthand) {
        OUT('\\'); OUT('-');
        continue;
      }
      OUT('-');
      continue;
    }
    if (src[si] != '\\' || si + 1 >= src_len) {
      OUT(src[si]);
      continue;
    }

    /* From here on src[si] is a backslash with at least one character after it. */
    char next = src[si + 1];
    if (next == 'v') {
      OUT('\\'); OUT('x'); OUT('{'); OUT('0'); OUT('b'); OUT('}');
      si++;
      continue;
    }
    if (next == 'u' && si + 2 < src_len && src[si + 2] == '{') {
      size_t brace_start = si + 3;
      size_t brace_end = brace_start;
      while (brace_end < src_len && src[brace_end] != '}' && is_xdigit(src[brace_end])) brace_end++;
      if (brace_end < src_len && src[brace_end] == '}' && brace_end > brace_start) {
        OUT('\\'); OUT('x'); OUT('{');
        for (size_t k = brace_start; k < brace_end; k++) OUT(src[k]);
        OUT('}');
        si = brace_end;
        continue;
      }
    }
    if (next == 'u' && si + 5 < src_len &&
        is_xdigit(src[si+2]) && is_xdigit(src[si+3]) &&
        is_xdigit(src[si+4]) && is_xdigit(src[si+5])) {
      OUT('\\'); OUT('x'); OUT('{');
      OUT(src[si+2]); OUT(src[si+3]); OUT(src[si+4]); OUT(src[si+5]);
      OUT('}');
      si += 5;
      continue;
    }
    if (next == 'u') {
      /* Malformed \u escape: emit a plain 'u'. */
      si++;
      OUT('u');
      continue;
    }
    if (next == 'x' && si + 3 < src_len &&
        is_xdigit(src[si+2]) && is_xdigit(src[si+3])) {
      OUT('\\'); OUT('x'); OUT(src[si+2]); OUT(src[si+3]);
      si += 3;
      continue;
    }
    if (next == 'x') {
      /* Malformed \x escape: emit a plain 'x'. */
      si++;
      OUT('x');
      continue;
    }
    if (next == '0' && (si + 2 >= src_len || src[si+2] < '0' || src[si+2] > '9')) {
      OUT('\\'); OUT('x'); OUT('{'); OUT('0'); OUT('}');
      si++;
      continue;
    }
    if (next >= '0' && next <= '7') {
      /* Legacy octal escape: consume up to three octal digits while the
       * value stays <= 255. A single non-zero digit is not octal and falls
       * through to the generic handling below. */
      unsigned int octal = next - '0';
      size_t advance = 1;
      if (si + 2 < src_len && src[si+2] >= '0' && src[si+2] <= '7') {
        octal = octal * 8 + (src[si+2] - '0');
        advance = 2;
        if (si + 3 < src_len && src[si+3] >= '0' && src[si+3] <= '7' && octal * 8 + (src[si+3] - '0') <= 255) {
          octal = octal * 8 + (src[si+3] - '0');
          advance = 3;
        }
      }
      if (advance > 1 || next == '0') {
        char hex[8];
        int hlen = snprintf(hex, sizeof(hex), "\\x{%02x}", octal);
        for (int k = 0; k < hlen && di < dst_size - 1; k++) OUT(hex[k]);
        si += advance;
        continue;
      }
    }
    if (next == 'c' && si + 2 < src_len &&
        ((src[si+2] >= 'A' && src[si+2] <= 'Z') || (src[si+2] >= 'a' && src[si+2] <= 'z'))) {
      OUT('\\'); OUT('c'); OUT(src[si+2]);
      si += 2;
      continue;
    }
    if (next == 'c') {
      /* \c without a letter: emit an escaped backslash followed by 'c'. */
      OUT('\\'); OUT('\\'); OUT('c');
      si++;
      continue;
    }
    if ((next == 'p' || next == 'P') && si + 2 < src_len && src[si + 2] == '{') {
      size_t brace_start = si + 3;
      size_t brace_end = brace_start;
      while (brace_end < src_len && src[brace_end] != '}') brace_end++;
      if (brace_end < src_len && src[brace_end] == '}') {
        const char *prop = &src[brace_start];
        size_t prop_len = brace_end - brace_start;
        /* Long General_Category names -> the short codes emitted to PCRE2. */
        static const struct { const char *name; const char *code; } gc_map[] = {
          {"Letter","L"},{"Cased_Letter","LC"},{"Uppercase_Letter","Lu"},
          {"Lowercase_Letter","Ll"},{"Titlecase_Letter","Lt"},
          {"Modifier_Letter","Lm"},{"Other_Letter","Lo"},
          {"Mark","M"},{"Nonspacing_Mark","Mn"},{"Spacing_Mark","Mc"},
          {"Enclosing_Mark","Me"},
          {"Number","N"},{"Decimal_Number","Nd"},{"Letter_Number","Nl"},
          {"Other_Number","No"},
          {"Punctuation","P"},{"Connector_Punctuation","Pc"},
          {"Dash_Punctuation","Pd"},{"Open_Punctuation","Ps"},
          {"Close_Punctuation","Pe"},{"Initial_Punctuation","Pi"},
          {"Final_Punctuation","Pf"},{"Other_Punctuation","Po"},
          {"Symbol","S"},{"Math_Symbol","Sm"},{"Currency_Symbol","Sc"},
          {"Modifier_Symbol","Sk"},{"Other_Symbol","So"},
          {"Separator","Z"},{"Space_Separator","Zs"},
          {"Line_Separator","Zl"},{"Paragraph_Separator","Zp"},
          {"Other","C"},{"Control","Cc"},{"Format","Cf"},
          {"Surrogate","Cs"},{"Private_Use","Co"},{"Unassigned","Cn"},
        };
        /* Scripts that are emitted as explicit code-point ranges. */
        static const struct { const char *script; const char *range; } u17_scripts[] = {
          {"Sidetic", "\\x{10940}-\\x{1095F}"},
          {"Garay", "\\x{10D40}-\\x{10D8F}"},
          {"Gurung_Khema", "\\x{16100}-\\x{1613F}"},
          {"Kirat_Rai", "\\x{16D40}-\\x{16D7F}"},
          {"Ol_Onal", "\\x{1E5D0}-\\x{1E5FF}"},
          {"Sunuwar", "\\x{11BC0}-\\x{11BFF}"},
          {"Tulu_Tigalari", "\\x{11380}-\\x{113FF}"},
        };
        bool has_eq = (memchr(prop, '=', prop_len) != NULL);
        bool has_colon = (memchr(prop, ':', prop_len) != NULL);
        if (!has_eq && !has_colon && next == 'p' && charclass_depth == 0) {
          /* Properties of strings: expanded to an alternation, only for
           * positive \p outside a character class. */
          static const struct { const char *name; const char *exp; } sprops[] = {
            {"Emoji_Keycap_Sequence",
             "(?:\\x{23}\\x{fe0f}\\x{20e3}|\\x{2a}\\x{fe0f}\\x{20e3}|[\\x{30}-\\x{39}]\\x{fe0f}\\x{20e3})"},
            {"RGI_Emoji",
             "(?:[\\x{1f1e6}-\\x{1f1ff}]{2}|(?:\\p{Emoji}[\\x{1f3fb}-\\x{1f3ff}]?\\x{200d})+\\p{Emoji}[\\x{1f3fb}-\\x{1f3ff}]?|\\p{Emoji}[\\x{1f3fb}-\\x{1f3ff}]|\\p{Emoji}\\x{fe0f}?)"},
          };
          for (size_t m = 0; m < sizeof(sprops)/sizeof(sprops[0]); m++) {
            if (strlen(sprops[m].name) == prop_len && memcmp(sprops[m].name, prop, prop_len) == 0) {
              for (const char *r = sprops[m].exp; *r && di < dst_size - 1; r++) OUT(*r);
              si = brace_end;
              goto next_char;
            }
          }
        }
        if (has_eq || has_colon) {
          /* key=value / key:value form: only the value is compared against
           * the script table. */
          char sep = has_eq ? '=' : ':';
          const char *val = memchr(prop, sep, prop_len);
          if (val) {
            val++;
            size_t val_len = prop_len - (size_t)(val - prop);
            for (size_t m = 0; m < sizeof(u17_scripts)/sizeof(u17_scripts[0]); m++) {
              if (strlen(u17_scripts[m].script) == val_len &&
                  memcmp(u17_scripts[m].script, val, val_len) == 0) {
                const char *r = u17_scripts[m].range;
                OUT('[');
                if (next == 'P') OUT('^');
                for (; *r; r++) OUT(*r);
                OUT(']');
                si = brace_end;
                goto next_char;
              }
            }
          }
        }
        if (!has_eq && !has_colon) {
          static const struct { const char *name; const char *range; } rangeprops[] = {
            {"ASCII", "\\x{0}-\\x{7f}"},
            {"Any", "\\x{0}-\\x{10ffff}"},
          };
          for (size_t m = 0; m < sizeof(rangeprops)/sizeof(rangeprops[0]); m++) {
            if (strlen(rangeprops[m].name) == prop_len && memcmp(rangeprops[m].name, prop, prop_len) == 0) {
              if (charclass_depth > 0) {
                /* Already inside [...]: emit the bare range. */
                for (const char *r = rangeprops[m].range; *r; r++) OUT(*r);
              } else {
                OUT('['); if (next == 'P') OUT('^');
                for (const char *r = rangeprops[m].range; *r; r++) OUT(*r);
                OUT(']');
              }
              si = brace_end;
              goto next_char;
            }
          }
        }
        const char *replacement = NULL;
        if (!has_eq && !has_colon) {
          for (size_t m = 0; m < sizeof(gc_map)/sizeof(gc_map[0]); m++) {
            if (strlen(gc_map[m].name) == prop_len &&
                memcmp(gc_map[m].name, prop, prop_len) == 0) {
              replacement = gc_map[m].code;
              break;
            }
          }
        }
        /* Extra code points OR-ed into a property when used outside a class. */
        static const struct { const char *prop; const char *extra; } u17_props[] = {
          {"Emoji", "\\x{1FACD}-\\x{1FACE}\\x{1FAE9}\\x{1FAF9}"},
        };
        const char *extra_range = NULL;
        if (!has_eq && !has_colon && !replacement) {
          for (size_t m = 0; m < sizeof(u17_props)/sizeof(u17_props[0]); m++) {
            if (strlen(u17_props[m].prop) == prop_len &&
                memcmp(u17_props[m].prop, prop, prop_len) == 0) {
              extra_range = u17_props[m].extra;
              break;
            }
          }
        }
        if (extra_range && charclass_depth == 0) {
          const char *pfx = (next == 'p') ? "(?:\\p{" : "(?:\\P{";
          for (const char *r = pfx; *r; r++) OUT(*r);
          for (size_t k = brace_start; k < brace_end; k++) OUT(src[k]);
          OUT('}'); OUT('|'); OUT('[');
          if (next == 'P') OUT('^');
          for (const char *r = extra_range; *r; r++) OUT(*r);
          OUT(']'); OUT(')');
        } else {
          OUT('\\'); OUT(next); OUT('{');
          if (replacement) {
            for (const char *r = replacement; *r; r++) OUT(*r);
          } else {
            for (size_t k = brace_start; k < brace_end; k++) OUT(src[k]);
          }
          OUT('}');
        }
        si = brace_end;
        continue;
      }
      /* Unterminated \p{ : copy the escape itself and keep scanning. */
      OUT('\\'); OUT(next);
      si++;
      continue;
    }
    if (is_pcre2_passthrough_escape(next)) {
      OUT('\\'); OUT(next);
      si++;
      continue;
    }
    /* Any other escape: drop the backslash and emit the character itself. */
    si++;
    OUT(next);
    next_char:;
  }
#undef OUT
  dst[di] = '\0';
  return di;
}
// Store property `key` (length `klen`) on `obj`. When `is_new` is true the
// property is created with js_mkprop_fast; otherwise it is assigned through
// js_setprop using a freshly created key string. `is_new` is evaluated once;
// the chosen branch evaluates the remaining arguments once.
#define REGEXP_SET_PROP(js, obj, key, klen, val, is_new) \
((is_new) ? js_mkprop_fast(js, obj, key, klen, val) \
: js_setprop(js, obj, js_mkstr(js, key, klen), val))
/*
 * Initialise (or reinitialise) the flag-related state of a RegExp object.
 *
 * fstr/flen: raw flags text as supplied by the caller; unknown characters are
 *            ignored and the stored "flags" string is rebuilt in the fixed
 *            order d, g, i, m, s, u, v, y.
 * is_new:    true to create the properties on a fresh object, false to assign
 *            over existing ones (see REGEXP_SET_PROP).
 *
 * Also resets "lastIndex" to 0, refreshes the cached mask / flags-string
 * slots and clears the named-groups slot. Property-write results are not
 * checked.
 */
static void regexp_init_flags(ant_t *js, ant_value_t obj, const char *fstr, ant_offset_t flen, bool is_new) {
  static const struct { uint8_t bit; char letter; } canonical_order[] = {
    {REGEXP_FLAG_HAS_INDICES, 'd'},
    {REGEXP_FLAG_GLOBAL,      'g'},
    {REGEXP_FLAG_IGNORE_CASE, 'i'},
    {REGEXP_FLAG_MULTILINE,   'm'},
    {REGEXP_FLAG_DOTALL,      's'},
    {REGEXP_FLAG_UNICODE,     'u'},
    {REGEXP_FLAG_UNICODE_SET, 'v'},
    {REGEXP_FLAG_STICKY,      'y'},
  };

  uint8_t mask = regexp_parse_flags_mask(fstr, flen);

  char sorted[10];
  int count = 0;
  for (size_t k = 0; k < sizeof(canonical_order) / sizeof(canonical_order[0]); k++) {
    if (mask & canonical_order[k].bit) sorted[count++] = canonical_order[k].letter;
  }
  ant_value_t flags_value = js_mkstr(js, sorted, count);

  REGEXP_SET_PROP(js, obj, "flags", 5, flags_value, is_new);
  REGEXP_SET_PROP(js, obj, "hasIndices", 10, mkval(T_BOOL, (mask & REGEXP_FLAG_HAS_INDICES) ? 1 : 0), is_new);
  REGEXP_SET_PROP(js, obj, "global", 6, mkval(T_BOOL, (mask & REGEXP_FLAG_GLOBAL) ? 1 : 0), is_new);
  REGEXP_SET_PROP(js, obj, "ignoreCase", 10, mkval(T_BOOL, (mask & REGEXP_FLAG_IGNORE_CASE) ? 1 : 0), is_new);
  REGEXP_SET_PROP(js, obj, "multiline", 9, mkval(T_BOOL, (mask & REGEXP_FLAG_MULTILINE) ? 1 : 0), is_new);
  REGEXP_SET_PROP(js, obj, "dotAll", 6, mkval(T_BOOL, (mask & REGEXP_FLAG_DOTALL) ? 1 : 0), is_new);
  REGEXP_SET_PROP(js, obj, "unicode", 7, mkval(T_BOOL, (mask & REGEXP_FLAG_UNICODE) ? 1 : 0), is_new);
  REGEXP_SET_PROP(js, obj, "unicodeSets", 11, mkval(T_BOOL, (mask & REGEXP_FLAG_UNICODE_SET) ? 1 : 0), is_new);
  REGEXP_SET_PROP(js, obj, "sticky", 6, mkval(T_BOOL, (mask & REGEXP_FLAG_STICKY) ? 1 : 0), is_new);
  REGEXP_SET_PROP(js, obj, "lastIndex", 9, tov(0), is_new);

  js_set_slot(obj, SLOT_REGEXP_FLAGS_MASK, tov((double)mask));
  js_set_slot(obj, SLOT_REGEXP_FLAGS_STRING, flags_value);
  js_set_slot(obj, SLOT_REGEXP_NAMED_GROUPS, js_mkundef());
}
/*
 * Decide whether `value` should be treated as a regular expression.
 *
 * Non-objects are never RegExp-like. If the match symbol is available and the
 * object has a non-undefined value for it, that value's truthiness decides.
 * Otherwise the object qualifies when the global RegExp.prototype is on its
 * prototype chain. Returns a JS boolean, or the error value produced by any
 * property read along the way.
 */
ant_value_t is_regexp_like(ant_t *js, ant_value_t value) {
  if (!is_object_type(value)) return js_false;

  ant_value_t match_sym = get_match_sym();
  if (vtype(match_sym) == T_SYMBOL) {
    ant_value_t matcher = js_get_sym(js, value, match_sym);
    if (is_err(matcher)) return matcher;
    bool matcher_defined = (vtype(matcher) != T_UNDEF);
    if (matcher_defined) return js_bool(js_truthy(js, matcher));
  }

  /* No usable symbol answer: fall back to a prototype-chain test. */
  ant_value_t ctor = js_get(js, js_glob(js), "RegExp");
  if (is_err(ctor)) return ctor;

  ant_value_t proto = js_get(js, ctor, "prototype");
  if (is_err(proto)) return proto;
  if (!is_object_type(proto)) return js_false;

  return js_bool(proto_chain_contains(js, value, proto));
}
/*
 * Report whether a plain RegExp(pattern) call should hand its argument back
 * unchanged: no new.target, no (defined) flags argument, the first argument is
 * a RegExp-like object, and its "constructor" is the global RegExp.
 * Returns a JS boolean, or an error value from the lookups involved.
 */
static ant_value_t should_regexp_passthrough(ant_t *js, ant_value_t *args, int nargs) {
  bool called_with_new = (vtype(js->new_target) != T_UNDEF);
  if (called_with_new || nargs <= 0 ||
      (nargs >= 2 && vtype(args[1]) != T_UNDEF) ||
      !is_object_type(args[0])) {
    return js_false;
  }

  ant_value_t regexp_like = is_regexp_like(js, args[0]);
  if (is_err(regexp_like)) return regexp_like;
  if (!js_truthy(js, regexp_like)) return js_false;

  ant_value_t arg_ctor = js_getprop_fallback(js, args[0], "constructor");
  if (is_err(arg_ctor)) return arg_ctor;

  ant_value_t global_ctor = js_get(js, js_glob(js), "RegExp");
  if (is_err(global_ctor)) return global_ctor;

  return js_bool(same_ctor_identity(js, arg_ctor, global_ctor));
}
/*
 * Guard for methods that must not receive a regular expression.
 *
 * Returns undefined when `value` is not RegExp-like, a TypeError naming
 * `method_name` when it is, or the error raised while classifying it.
 */
ant_value_t reject_regexp_arg(ant_t *js, ant_value_t value, const char *method_name) {
  ant_value_t regexp_like = is_regexp_like(js, value);
  if (is_err(regexp_like)) return regexp_like;

  if (!js_truthy(js, regexp_like)) return js_mkundef();

  return js_mkerr_typed(js, JS_ERR_TYPE, "First argument to %s must not be a RegExp", method_name);
}
/*
 * Invoke `ctor` as a constructor with `ctor_args`.
 *
 * A fresh object is created as the receiver and given ctor.prototype as its
 * prototype when that is an object. js->new_target is set to `ctor` for the
 * duration of the call and restored afterwards. Returns the constructor's
 * result, an error value from any step, or a TypeError when the result is not
 * an object. The `rx` parameter is currently unused.
 */
static ant_value_t regexp_species_construct(ant_t *js, ant_value_t rx, ant_value_t ctor, ant_value_t *ctor_args, int nargs) {
  ant_value_t receiver = js_mkobj(js);
  if (is_err(receiver)) return receiver;

  ant_value_t ctor_proto = js_get(js, ctor, "prototype");
  if (is_err(ctor_proto)) return ctor_proto;
  if (is_object_type(ctor_proto)) js_set_proto_init(receiver, ctor_proto);

  ant_value_t previous_target = js->new_target;
  js->new_target = ctor;
  ant_value_t constructed = sv_vm_call(js->vm, js, ctor, receiver, ctor_args, nargs, NULL, true);
  js->new_target = previous_target;

  if (is_err(constructed)) return constructed;
  if (is_object_type(constructed)) return constructed;
  return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp species constructor returned non-object");
}
// Forward declarations for functions defined later in this file.
static ant_value_t regexp_exec_abstract(ant_t *js, ant_value_t rx, ant_value_t str);
static ant_value_t builtin_regexp_exec(ant_t *js, ant_value_t *args, int nargs);
/* Linear search of the compiled-pattern cache by object identity.
 * Returns the matching entry, or NULL when `obj` has none. */
static regex_cache_entry_t *regex_cache_lookup(ant_object_t *obj) {
  size_t idx = 0;
  while (idx < regex_cache_count) {
    regex_cache_entry_t *candidate = &regex_cache[idx];
    if (candidate->obj == obj) return candidate;
    idx++;
  }
  return NULL;
}
/*
 * Append an entry to the compiled-pattern cache.
 *
 * The backing array starts at 64 entries and doubles when full. Returns the
 * new entry, or NULL when the array could not be grown (the cache is left
 * unchanged and the caller keeps ownership of `code` and `match_data`).
 * No check is made for an existing entry with the same `obj`.
 */
static regex_cache_entry_t *regex_cache_insert(ant_object_t *obj, pcre2_code *code, pcre2_match_data *match_data, bool jit_ready) {
  if (regex_cache_count >= regex_cache_cap) {
    size_t grown_cap = (regex_cache_cap == 0) ? 64 : regex_cache_cap * 2;
    regex_cache_entry_t *grown = realloc(regex_cache, grown_cap * sizeof(regex_cache_entry_t));
    if (grown == NULL) return NULL;
    regex_cache = grown;
    regex_cache_cap = grown_cap;
  }

  regex_cache_entry_t *slot = &regex_cache[regex_cache_count];
  regex_cache_count++;

  slot->obj = obj;
  slot->code = code;
  slot->match_data = match_data;
  slot->jit_ready = jit_ready;
  return slot;
}
// Result of regex_get_or_compile: the PCRE2 objects to match with. The
// pointers are copies of what the cache holds (or has just been given).
typedef struct {
pcre2_code *code; // compiled pattern
pcre2_match_data *match_data; // match data block created from `code`
bool jit_ready; // true when JIT compilation of `code` succeeded
} compiled_regex_t;
/*
 * Fetch the compiled PCRE2 pattern for a RegExp object, compiling and caching
 * it on first use.
 *
 * On success fills *out and returns true. Returns false when the object has
 * no string "source", the pattern fails to compile, or an allocation fails;
 * in every failure case nothing is added to the cache and no PCRE2 resource
 * is leaked. The cache owns the returned pointers.
 *
 * Side effects on first compile: stores the named-group metadata in
 * SLOT_REGEXP_NAMED_GROUPS of regexp_obj.
 *
 * NOTE(review): the translated pattern is built in a fixed 4096-byte buffer
 * and js_to_pcre2_pattern truncates silently, so a very long source is
 * compiled in truncated form.
 */
static bool regex_get_or_compile(ant_t *js, ant_value_t regexp_obj, compiled_regex_t *out) {
  ant_object_t *obj_ptr = js_obj_ptr(regexp_obj);
  uint8_t flags_mask = regexp_flags_mask(js, regexp_obj);

  regex_cache_entry_t *cached = regex_cache_lookup(obj_ptr);
  if (cached) {
    out->code = cached->code;
    out->match_data = cached->match_data;
    out->jit_ready = cached->jit_ready;
    return true;
  }

  ant_offset_t source_off = lkp(js, regexp_obj, "source", 6);
  if (source_off == 0) return false;
  ant_value_t source_val = js_propref_load(js, source_off);
  if (vtype(source_val) != T_STR) return false;

  ant_offset_t plen, poff = vstr(js, source_val, &plen);
  const char *pattern_ptr = (char *)(uintptr_t)(poff);
  char pcre2_pattern[4096];
  size_t pcre2_len = js_to_pcre2_pattern(
    pattern_ptr, plen, pcre2_pattern, sizeof(pcre2_pattern),
    (flags_mask & REGEXP_FLAG_UNICODE_SET) != 0
  );

  uint32_t options = PCRE2_UTF | PCRE2_UCP | PCRE2_MATCH_UNSET_BACKREF | PCRE2_DUPNAMES;
  if (flags_mask & REGEXP_FLAG_IGNORE_CASE) options |= PCRE2_CASELESS;
  if (flags_mask & REGEXP_FLAG_MULTILINE) options |= PCRE2_MULTILINE;
  if (flags_mask & REGEXP_FLAG_DOTALL) options |= PCRE2_DOTALL;

  int errcode;
  PCRE2_SIZE erroffset;
  pcre2_code *re = pcre2_compile((PCRE2_SPTR)pcre2_pattern, pcre2_len, options, &errcode, &erroffset, NULL);
  if (re == NULL) return false;

  /* Allocation can fail; matching against a NULL match-data block would crash. */
  pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(re, NULL);
  if (match_data == NULL) {
    pcre2_code_free(re);
    return false;
  }

  /* JIT is optional: on failure matching falls back to the interpreter. */
  bool jit_ready = pcre2_jit_compile(re, PCRE2_JIT_COMPLETE) == 0;

  /* Build the metadata before publishing to the cache so that a failure here
   * never has to roll a cache entry back. */
  ant_value_t groups_meta = regexp_build_named_groups_meta(js, re);
  if (is_err(groups_meta)) {
    pcre2_match_data_free(match_data);
    pcre2_code_free(re);
    return false;
  }

  /* If the cache cannot grow, nobody would own `re`/`match_data`; release
   * them instead of handing out pointers that are never freed. */
  if (regex_cache_insert(obj_ptr, re, match_data, jit_ready) == NULL) {
    pcre2_match_data_free(match_data);
    pcre2_code_free(re);
    return false;
  }

  js_set_slot(regexp_obj, SLOT_REGEXP_NAMED_GROUPS, groups_meta);
  out->code = re;
  out->match_data = match_data;
  out->jit_ready = jit_ready;
  return true;
}
/*
 * RegExp(pattern, flags) / new RegExp(pattern, flags).
 *
 * - Called without `new`, with a RegExp-like pattern whose "constructor" is
 *   the global RegExp and with no (defined) flags, the pattern object itself
 *   is returned.
 * - Otherwise a RegExp object is initialised: `this` is reused when invoked
 *   via `new` on an object receiver, else a fresh object is created.
 * - pattern: a RegExp-like object contributes its "source" (and its "flags"
 *   unless flags are given); a string is used directly; undefined/missing
 *   means the empty pattern; anything else is converted to a string.
 * - flags: whenever a non-undefined flags argument is present it is converted
 *   to a string and applied, regardless of what the pattern argument was.
 *   Unrecognised flag characters are ignored by regexp_parse_flags_mask.
 *
 * Returns the RegExp object, or the error value from any failed conversion or
 * property read. The pattern is not compiled here.
 */
static ant_value_t builtin_RegExp(ant_t *js, ant_value_t *args, int nargs) {
  bool pattern_is_regexp = false;
  if (nargs > 0) {
    ant_value_t is_re = is_regexp_like(js, args[0]);
    if (is_err(is_re)) return is_re;
    pattern_is_regexp = js_truthy(js, is_re);
  }
  bool flags_given = (nargs >= 2 && vtype(args[1]) != T_UNDEF);

  /* Plain-call passthrough: RegExp(re) returns `re` itself. */
  if (vtype(js->new_target) == T_UNDEF && pattern_is_regexp && !flags_given) {
    ant_value_t ctor = js_getprop_fallback(js, args[0], "constructor");
    if (is_err(ctor)) return ctor;
    ant_value_t regexp_ctor = js_get(js, js_glob(js), "RegExp");
    if (is_err(regexp_ctor)) return regexp_ctor;
    if (same_ctor_identity(js, ctor, regexp_ctor)) return args[0];
  }

  ant_value_t regexp_obj = js->this_val;
  bool use_this = (vtype(js->new_target) != T_UNDEF && vtype(regexp_obj) == T_OBJ);
  if (!use_this) {
    regexp_obj = mkobj(js, 0);
    if (is_err(regexp_obj)) return regexp_obj;
  }

  ant_value_t regexp_proto = js_get_ctor_proto(js, "RegExp", 6);
  ant_value_t instance_proto = js_instance_proto_from_new_target(js, regexp_proto);
  if (is_object_type(instance_proto)) js_set_proto_init(regexp_obj, instance_proto);
  if (vtype(js->new_target) == T_FUNC || vtype(js->new_target) == T_CFUNC) {
    js_set_slot(regexp_obj, SLOT_CTOR, js->new_target);
  }

  ant_value_t pattern = js_mkstr(js, "", 0);
  ant_value_t flags = js_mkstr(js, "", 0);

  if (pattern_is_regexp) {
    ant_value_t src = js_getprop_fallback(js, args[0], "source");
    if (is_err(src)) return src;
    pattern = js_tostring_val(js, src);
    if (is_err(pattern)) return pattern;
    if (!flags_given) {
      /* Inherit the flags of the source RegExp. */
      ant_value_t fl = js_getprop_fallback(js, args[0], "flags");
      if (is_err(fl)) return fl;
      flags = js_tostring_val(js, fl);
      if (is_err(flags)) return flags;
    }
  } else if (nargs > 0 && vtype(args[0]) == T_STR) {
    pattern = args[0];
  } else if (nargs > 0 && vtype(args[0]) != T_UNDEF) {
    pattern = js_tostring_val(js, args[0]);
    if (is_err(pattern)) return pattern;
  }

  /* Flags are handled once, after the pattern, for every pattern kind. This
   * keeps them when the pattern is undefined and stringifies non-string
   * flags, matching the RegExp-like branch and RegExp.prototype.compile. */
  if (flags_given) {
    flags = (vtype(args[1]) == T_STR) ? args[1] : js_tostring_val(js, args[1]);
    if (is_err(flags)) return flags;
  }

  js_mkprop_fast(js, regexp_obj, "source", 6, pattern);
  ant_offset_t flags_len, flags_off = vstr(js, flags, &flags_len);
  regexp_init_flags(js, regexp_obj, (const char *)(uintptr_t)(flags_off), flags_len, true);
  return regexp_obj;
}
/*
 * Getter for the "groups" property of a match result.
 *
 * Builds a null-prototype object mapping each group name to the corresponding
 * capture of the result array (`this`), using the name/number pairs stored in
 * SLOT_REGEXP_RESULT_GROUPS. The object is built on first access and cached
 * in SLOT_REGEXP_GROUPS_CACHE. Returns undefined when `this` is not an object
 * or carries no group metadata, or an error value if building fails.
 */
static ant_value_t builtin_regexp_groups_getter(ant_t *js, ant_value_t *args, int nargs) {
  ant_value_t result_arr = js->this_val;
  if (!is_object_type(result_arr)) return js_mkundef();

  ant_value_t memo = js_get_slot(result_arr, SLOT_REGEXP_GROUPS_CACHE);
  if (is_object_type(memo)) return memo;

  ant_value_t meta = js_get_slot(result_arr, SLOT_REGEXP_RESULT_GROUPS);
  if (!is_object_type(meta)) return js_mkundef();

  ant_value_t groups = js_mkobj(js);
  if (is_err(groups)) return groups;
  js_set_proto_init(groups, js_mknull());

  /* meta is a flat list of (name, capture number) pairs; an undefined name
   * marks the end. */
  ant_offset_t cursor = 0;
  while (true) {
    ant_value_t name = js_arr_get(js, meta, cursor);
    if (vtype(name) == T_UNDEF) break;

    ant_value_t number_val = js_arr_get(js, meta, cursor + 1);
    ant_offset_t capture = 0;
    if (vtype(number_val) == T_NUM) capture = (ant_offset_t)tod(number_val);

    char capture_key[16];
    (void)uint_to_str(capture_key, sizeof(capture_key), (uint64_t)capture);
    ant_value_t captured = js_getprop_fallback(js, result_arr, capture_key);

    ant_offset_t name_len;
    ant_offset_t name_off = vstr(js, name, &name_len);
    ant_value_t status = setprop_cstr(js, groups, (const char *)(uintptr_t)name_off, (size_t)name_len, captured);
    if (is_err(status)) return status;

    cursor += 2;
  }

  js_set_slot(result_arr, SLOT_REGEXP_GROUPS_CACHE, groups);
  return groups;
}
/* Build the [start, end] array for one capture of an `indices` result.
 * Returns undefined for a capture that did not participate (start is
 * PCRE2_UNSET), or an error value if the array cannot be created. */
static ant_value_t regexp_build_indices_pair(ant_t *js, PCRE2_SIZE start, PCRE2_SIZE end) {
  bool participated = (start != PCRE2_UNSET);
  if (!participated) return js_mkundef();

  ant_value_t bounds = js_mkarr(js);
  if (is_err(bounds)) return bounds;

  js_arr_push(js, bounds, tov((double)start));
  js_arr_push(js, bounds, tov((double)end));
  return bounds;
}
/*
 * Build the `indices.groups` object: a null-prototype object mapping each
 * group name in `groups_meta` (flat name/number pairs) to the entry of
 * `indices_arr` for that capture number.
 * Returns the object, or an error value if creating it or setting a property
 * fails.
 */
static ant_value_t regexp_build_indices_groups(
  ant_t *js,
  ant_value_t groups_meta,
  ant_value_t indices_arr
) {
  ant_value_t groups = js_mkobj(js);
  if (is_err(groups)) return groups;
  js_set_proto_init(groups, js_mknull());

  ant_offset_t cursor = 0;
  while (true) {
    ant_value_t name = js_arr_get(js, groups_meta, cursor);
    if (vtype(name) == T_UNDEF) break; /* end of the pair list */

    ant_value_t number_val = js_arr_get(js, groups_meta, cursor + 1);
    ant_offset_t capture = 0;
    if (vtype(number_val) == T_NUM) capture = (ant_offset_t)tod(number_val);

    char capture_key[16];
    (void)uint_to_str(capture_key, sizeof(capture_key), (uint64_t)capture);
    ant_value_t bounds = js_getprop_fallback(js, indices_arr, capture_key);

    ant_offset_t name_len;
    ant_offset_t name_off = vstr(js, name, &name_len);
    ant_value_t status = setprop_cstr(js, groups, (const char *)(uintptr_t)name_off, (size_t)name_len, bounds);
    if (is_err(status)) return status;

    cursor += 2;
  }
  return groups;
}
/*
 * Build the `indices` array of a match result (the `d` flag).
 *
 * One [start, end] pair (or undefined) is pushed per capture, for at most the
 * first 32 captures. The array's "groups" property is the per-name index
 * object when the RegExp has named-group metadata, otherwise undefined.
 * Returns the array or an error value.
 */
static ant_value_t regexp_build_indices_result(
  ant_t *js,
  ant_value_t regexp,
  PCRE2_SIZE *ovector,
  uint32_t ovcount
) {
  ant_value_t indices_arr = js_mkarr(js);
  if (is_err(indices_arr)) return indices_arr;

  uint32_t limit = (ovcount < 32) ? ovcount : 32;
  for (uint32_t cap = 0; cap < limit; cap++) {
    ant_value_t bounds = regexp_build_indices_pair(js, ovector[2 * cap], ovector[2 * cap + 1]);
    if (is_err(bounds)) return bounds;
    js_arr_push(js, indices_arr, bounds);
  }

  ant_value_t groups = js_mkundef();
  ant_value_t groups_meta = js_get_slot(regexp, SLOT_REGEXP_NAMED_GROUPS);
  if (is_object_type(groups_meta)) {
    groups = regexp_build_indices_groups(js, groups_meta, indices_arr);
    if (is_err(groups)) return groups;
  }

  if (is_err(setprop_cstr(js, indices_arr, "groups", 6, groups))) return js_mkerr(js, "oom");
  return indices_arr;
}
// Core of RegExp matching: run `regexp` against the string `str_arg`.
//
// Returns:
//   - null when there is no match, when lastIndex is out of range, or when
//     the pattern cannot be compiled;
//   - js_true on a match when `truthy_only` is set (no result array is built);
//   - otherwise a result array with the captures plus "index", "input",
//     "groups" and, with the `d` flag, "indices";
//   - an error value if a property write or allocation reports failure.
//
// Side effects: for global/sticky expressions "lastIndex" is read before and
// written after the match; on every successful match the RegExp constructor
// statics ($1..$9, lastMatch, $&) are refreshed.
static ant_value_t regexp_exec_internal(ant_t *js, ant_value_t regexp, ant_value_t str_arg, bool truthy_only) {
ant_offset_t str_len, str_off = vstr(js, str_arg, &str_len);
const char *str_ptr = (char *)(uintptr_t)(str_off);
uint8_t flags_mask = regexp_flags_mask(js, regexp);
bool global_flag = (flags_mask & REGEXP_FLAG_GLOBAL) != 0;
bool has_indices = (flags_mask & REGEXP_FLAG_HAS_INDICES) != 0;
bool sticky_flag = (flags_mask & REGEXP_FLAG_STICKY) != 0;
// TODO: reduce nesting
// Only global/sticky expressions honour lastIndex; everything else starts at 0.
PCRE2_SIZE start_offset = 0;
if (global_flag || sticky_flag) {
ant_offset_t lastindex_off = lkp(js, regexp, "lastIndex", 9);
if (lastindex_off != 0) {
ant_value_t li_val = js_propref_load(js, lastindex_off);
if (vtype(li_val) == T_NUM) {
double li = tod(li_val);
if (li >= 0 && li <= (double)str_len) start_offset = (PCRE2_SIZE)li;
else {
// lastIndex outside [0, str_len]: reset it and report no match.
if (is_err(setprop_cstr(js, regexp, "lastIndex", 9, tov(0)))) return js_mkerr(js, "oom");
return js_mknull();
}
}
}
}
compiled_regex_t compiled;
if (!regex_get_or_compile(js, regexp, &compiled)) return js_mknull();
uint32_t match_options = 0;
if (sticky_flag) match_options |= PCRE2_ANCHORED;
int rc;
// The JIT fast path is used only for non-sticky matches; sticky matches go
// through pcre2_match with PCRE2_ANCHORED.
if (compiled.jit_ready && !sticky_flag) {
rc = pcre2_jit_match(compiled.code, (PCRE2_SPTR)str_ptr, str_len, start_offset, match_options, compiled.match_data, NULL);
} else rc = pcre2_match(compiled.code, (PCRE2_SPTR)str_ptr, str_len, start_offset, match_options, compiled.match_data, NULL);
if (rc < 0) {
// Any negative return (no match or a matching error) is reported as null.
if ((global_flag || sticky_flag) && is_err(setprop_cstr(js, regexp, "lastIndex", 9, tov(0)))) {
return js_mkerr(js, "oom");
}
return js_mknull();
}
PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(compiled.match_data);
uint32_t ovcount = pcre2_get_ovector_count(compiled.match_data);
update_regexp_statics(js, str_ptr, ovector, ovcount);
if (global_flag || sticky_flag) {
// Advance lastIndex to the end offset of the whole match.
ant_value_t next_idx = tov((double)ovector[1]);
if (is_err(setprop_cstr(js, regexp, "lastIndex", 9, next_idx))) return js_mkerr(js, "oom");
}
if (truthy_only) return js_true;
ant_value_t result_arr = js_mkarr(js);
if (is_err(result_arr)) return result_arr;
// At most the first 32 ovector pairs are copied into the result array.
for (uint32_t i = 0; i < ovcount && i < 32; i++) {
PCRE2_SIZE start = ovector[2*i];
PCRE2_SIZE end = ovector[2*i+1];
if (start == PCRE2_UNSET) {
js_arr_push(js, result_arr, js_mkundef());
} else {
ant_value_t match_str = js_mkstr(js, str_ptr + start, end - start);
js_arr_push(js, result_arr, match_str);
}
}
if (is_err(setprop_cstr(js, result_arr, "index", 5, tov((double)ovector[0])))) return js_mkerr(js, "oom");
if (is_err(setprop_cstr(js, result_arr, "input", 5, str_arg))) return js_mkerr(js, "oom");
ant_value_t groups_meta = js_get_slot(regexp, SLOT_REGEXP_NAMED_GROUPS);
if (is_object_type(groups_meta)) {
// Named groups are exposed through a getter that builds the object on
// first access (see builtin_regexp_groups_getter).
js_set_slot(result_arr, SLOT_REGEXP_RESULT_GROUPS, groups_meta);
js_set_slot(result_arr, SLOT_REGEXP_GROUPS_CACHE, js_mkundef());
js_set_getter_desc(js, js_as_obj(result_arr), "groups", 6, js_mkfun(builtin_regexp_groups_getter), JS_DESC_E | JS_DESC_C);
} else if (is_err(setprop_cstr(js, result_arr, "groups", 6, js_mkundef()))) return js_mkerr(js, "oom");
if (has_indices) {
ant_value_t indices = regexp_build_indices_result(js, regexp, ovector, ovcount);
if (is_err(indices)) return indices;
if (is_err(setprop_cstr(js, result_arr, "indices", 7, indices))) return js_mkerr(js, "oom");
}
return result_arr;
}
/*
 * RegExp.prototype.exec(string).
 *
 * Raises an error when `this` is not an object value; returns null when the
 * argument is missing or is not a string (no conversion is attempted);
 * otherwise returns the full match result from regexp_exec_internal.
 */
static ant_value_t builtin_regexp_exec(ant_t *js, ant_value_t *args, int nargs) {
  ant_value_t self = js->this_val;
  if (vtype(self) != T_OBJ) return js_mkerr(js, "exec called on non-regexp");

  bool has_string_arg = (nargs >= 1 && vtype(args[0]) == T_STR);
  if (!has_string_arg) return js_mknull();

  return regexp_exec_internal(js, self, args[0], false);
}
/*
 * RegExp.prototype.toString(): returns "/" + source + "/" + flags, where both
 * parts are read from `this` and converted to strings.
 * Returns a TypeError when `this` is not an object, or the error value from a
 * failed property read, conversion or allocation.
 */
static ant_value_t builtin_regexp_toString(ant_t *js, ant_value_t *args, int nargs) {
  ant_value_t self = js->this_val;
  if (!is_object_type(self)) {
    return js_mkerr_typed(js, JS_ERR_TYPE, "toString called on non-object");
  }

  ant_value_t source_val = js_getprop_fallback(js, self, "source");
  if (is_err(source_val)) return source_val;
  ant_value_t source_str = js_tostring_val(js, source_val);
  if (is_err(source_str)) return source_str;

  ant_value_t flags_val = js_getprop_fallback(js, self, "flags");
  if (is_err(flags_val)) return flags_val;
  ant_value_t flags_str = js_tostring_val(js, flags_val);
  if (is_err(flags_str)) return flags_str;

  ant_offset_t src_len;
  ant_offset_t src_off = vstr(js, source_str, &src_len);
  ant_offset_t fl_len;
  ant_offset_t fl_off = vstr(js, flags_str, &fl_len);

  /* Two slashes plus both parts; one extra byte is allocated beyond that. */
  size_t total = 1 + src_len + 1 + fl_len;
  /* NOTE(review): the buffer comes from ant_calloc but is released with
   * free(); confirm the two allocators are compatible. */
  char *buf = ant_calloc(total + 1);
  if (!buf) return js_mkerr(js, "oom");

  char *cursor = buf;
  *cursor++ = '/';
  memcpy(cursor, (const void *)(uintptr_t)src_off, src_len);
  cursor += src_len;
  *cursor++ = '/';
  memcpy(cursor, (const void *)(uintptr_t)fl_off, fl_len);
  cursor += fl_len;

  ant_value_t result = js_mkstr(js, buf, (size_t)(cursor - buf));
  free(buf);
  return result;
}
/*
 * RegExp.prototype.compile(pattern, flags): re-initialises the receiver.
 *
 * - pattern undefined/missing: the source becomes "".
 * - pattern regexp-like (per is_regexp_like): its "source" and "flags" are
 *   stringified and used.
 * - otherwise the pattern is stringified.
 * A defined second argument overrides the flags. After updating "source" and
 * the flag state, any cached compiled pattern for this object is freed and
 * removed from regex_cache. Returns the receiver.
 */
static ant_value_t builtin_regexp_compile(ant_t *js, ant_value_t *args, int nargs) {
  ant_value_t self = js->this_val;
  if (!is_object_type(self)) {
    return js_mkerr_typed(js, JS_ERR_TYPE, "compile called on non-object");
  }

  ant_value_t new_source = js_mkstr(js, "", 0);
  ant_value_t new_flags = js_mkstr(js, "", 0);

  bool have_pattern = nargs > 0 && vtype(args[0]) != T_UNDEF;
  if (have_pattern) {
    ant_value_t regexp_like = is_regexp_like(js, args[0]);
    if (is_err(regexp_like)) return regexp_like;

    if (!js_truthy(js, regexp_like)) {
      new_source = js_tostring_val(js, args[0]);
      if (is_err(new_source)) return new_source;
    } else {
      ant_value_t other_source = js_getprop_fallback(js, args[0], "source");
      if (is_err(other_source)) return other_source;
      new_source = js_tostring_val(js, other_source);
      if (is_err(new_source)) return new_source;

      ant_value_t other_flags = js_getprop_fallback(js, args[0], "flags");
      if (is_err(other_flags)) return other_flags;
      new_flags = js_tostring_val(js, other_flags);
      if (is_err(new_flags)) return new_flags;
    }
  }

  bool have_flags = nargs > 1 && vtype(args[1]) != T_UNDEF;
  if (have_flags) {
    new_flags = js_tostring_val(js, args[1]);
    if (is_err(new_flags)) return new_flags;
  }

  js_setprop(js, self, js_mkstr(js, "source", 6), new_source);

  ant_offset_t flags_len, flags_off = vstr(js, new_flags, &flags_len);
  regexp_init_flags(js, self, (const char *)(uintptr_t)(flags_off), flags_len, false);

  // Drop the stale compiled form, if one is cached; the last entry is moved
  // into the freed slot to keep the cache dense.
  ant_object_t *self_obj = js_obj_ptr(self);
  size_t slot = 0;
  while (slot < regex_cache_count && regex_cache[slot].obj != self_obj) slot++;
  if (slot < regex_cache_count) {
    pcre2_match_data_free(regex_cache[slot].match_data);
    pcre2_code_free(regex_cache[slot].code);
    regex_cache[slot] = regex_cache[--regex_cache_count];
  }

  return self;
}
/*
 * True for the characters that builtin_regexp_escape backslash-escapes:
 * ^ $ \ . * + ? ( ) [ ] { } | /
 */
static inline bool is_syntax_char(char c) {
  switch (c) {
    case '^': case '$': case '\\': case '.': case '*':
    case '+': case '?': case '(': case ')': case '[':
    case ']': case '{': case '}': case '|': case '/':
      return true;
    default:
      return false;
  }
}
/*
 * True for the ASCII punctuators that builtin_regexp_escape emits as a
 * "\xNN" hex escape: , - : ; < = > @ ! " # % & ' ` ~
 */
static inline bool is_other_punctuator(char c) {
  switch (c) {
    case ',': case '-': case ':': case ';': case '<':
    case '=': case '>': case '@': case '!': case '"':
    case '#': case '%': case '&': case '\'': case '`': case '~':
      return true;
    default:
      return false;
  }
}
/*
 * RegExp.escape(string): returns a copy of the string argument in which
 *  - a leading ASCII letter or digit becomes "\xNN",
 *  - regex syntax characters (see is_syntax_char) get a leading backslash,
 *  - other ASCII punctuators and ASCII whitespace become "\xNN",
 *  - bytes >= 0x80 are copied through one UTF-8 sequence at a time,
 *  - everything else is copied unchanged.
 *
 * Returns a TypeError when the first argument is missing or not a string and
 * an "oom" error when the scratch buffer cannot be allocated.
 */
static ant_value_t builtin_regexp_escape(ant_t *js, ant_value_t *args, int nargs) {
  if (nargs < 1 || vtype(args[0]) != T_STR)
    return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp.escape requires a string argument");

  ant_offset_t slen, soff = vstr(js, args[0], &slen);
  const char *src = (const char *)(uintptr_t)(soff);

  // The longest expansion produced below is 4 bytes per input byte ("\xNN"),
  // so 6x the input length (plus the snprintf terminator) always suffices.
  size_t buf_cap = slen * 6 + 1;
  char *buf = ant_calloc(buf_cap);
  if (!buf) return js_mkerr(js, "oom");

  size_t di = 0;
  bool first = true;
  for (size_t si = 0; si < slen; ) {
    unsigned char c = (unsigned char)src[si];

    if (c >= 0x80) {
      utf8proc_int32_t cp;
      int bytes = (int)utf8_next(
        (const utf8proc_uint8_t *)&src[si],
        (utf8proc_ssize_t)(slen - si), &cp
      );
      (void)cp;
      // Always consume at least one byte: if the decoder reports a zero or
      // negative length (e.g. for a malformed sequence) the loop would
      // otherwise never advance.
      if (bytes < 1) bytes = 1;
      for (int b = 0; b < bytes && si < slen; b++)
        buf[di++] = src[si++];
      first = false;
      continue;
    }

    bool is_alnum = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    if (first && is_alnum) {
      di += snprintf(buf + di, buf_cap - di, "\\x%02x", c);
      si++; first = false;
      continue;
    }

    if (is_syntax_char(c)) {
      buf[di++] = '\\'; buf[di++] = c;
      si++; first = false;
      continue;
    }

    if (is_other_punctuator(c) || c == ' ' || c == '\t' || c == '\n' ||
        c == '\r' || c == '\v' || c == '\f') {
      di += snprintf(buf + di, buf_cap - di, "\\x%02x", c);
      si++; first = false;
      continue;
    }

    buf[di++] = c;
    si++; first = false;
  }

  ant_value_t result = js_mkstr(js, buf, di);
  free(buf);
  return result;
}
/*
 * Runs a regexp match through an already-looked-up "exec" value.
 *
 * If exec_fn is callable (T_FUNC or T_CFUNC) it is invoked with rx as the
 * receiver and str as the only argument; a result that is neither an object
 * nor null is turned into a TypeError. If exec_fn is not callable, the
 * built-in exec is run instead with js->this_val temporarily set to rx.
 */
static ant_value_t regexp_exec_with_exec_fn(ant_t *js, ant_value_t rx, ant_value_t str, ant_value_t exec_fn) {
  ant_value_t argv[1] = { str };

  bool callable = vtype(exec_fn) == T_FUNC || vtype(exec_fn) == T_CFUNC;
  if (!callable) {
    // builtin_regexp_exec reads its receiver from js->this_val.
    ant_value_t previous_this = js->this_val;
    js->this_val = rx;
    ant_value_t builtin_result = builtin_regexp_exec(js, argv, 1);
    js->this_val = previous_this;
    return builtin_result;
  }

  ant_value_t user_result = sv_vm_call(js->vm, js, exec_fn, rx, argv, 1, NULL, false);
  if (is_err(user_result)) return user_result;
  if (is_object_type(user_result) || vtype(user_result) == T_NULL) return user_result;
  return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp exec returned non-object");
}
/*
 * Looks up "exec" on rx and runs the match through it (see
 * regexp_exec_with_exec_fn). A failed lookup is returned as-is.
 */
static ant_value_t regexp_exec_abstract(ant_t *js, ant_value_t rx, ant_value_t str) {
  ant_value_t exec_fn = js_get(js, rx, "exec");
  if (!is_err(exec_fn)) {
    return regexp_exec_with_exec_fn(js, rx, str, exec_fn);
  }
  return exec_fn;
}
/*
 * Fast path for callers that only need to know whether exec() matched.
 *
 * Applies only when call_func is the built-in exec entry point, regexp is an
 * object and arg is a string; otherwise returns false and leaves *out_result
 * untouched. When it applies, returns true and stores either the error from
 * the match or a boolean (true when the match result is not null).
 */
bool regexp_exec_truthy_try_fast(
  ant_t *js,
  ant_value_t call_func,
  ant_value_t regexp,
  ant_value_t arg,
  ant_value_t *out_result
) {
  bool eligible =
    out_result != NULL &&
    vtype(call_func) == T_CFUNC &&
    js_cfunc_same_entrypoint(call_func, builtin_regexp_exec) &&
    is_object_type(regexp) &&
    vtype(arg) == T_STR;
  if (!eligible) return false;

  ant_value_t match = regexp_exec_internal(js, regexp, arg, true);
  if (is_err(match)) {
    *out_result = match;
  } else {
    *out_result = mkval(T_BOOL, vtype(match) != T_NULL ? 1 : 0);
  }
  return true;
}
/*
 * RegExp.prototype.test(string): true when exec() yields a non-null result.
 *
 * The argument is stringified (a missing argument becomes "undefined").
 * When the receiver's "exec" is still the built-in, the internal matcher is
 * called directly in test-only mode; otherwise the looked-up exec is used.
 */
static ant_value_t builtin_regexp_test(ant_t *js, ant_value_t *args, int nargs) {
  ant_value_t self = js->this_val;
  if (!is_object_type(self)) {
    return js_mkerr_typed(js, JS_ERR_TYPE, "test called on non-object");
  }

  ant_value_t subject;
  if (nargs > 0) subject = js_tostring_val(js, args[0]);
  else subject = js_mkstr(js, "undefined", 9);
  if (is_err(subject)) return subject;

  ant_value_t exec_fn = js_get(js, self, "exec");
  if (is_err(exec_fn)) return exec_fn;

  bool uses_builtin_exec =
    vtype(exec_fn) == T_CFUNC && js_cfunc_same_entrypoint(exec_fn, builtin_regexp_exec);

  ant_value_t match = uses_builtin_exec
    ? regexp_exec_internal(js, self, subject, true)
    : regexp_exec_with_exec_fn(js, self, subject, exec_fn);
  if (is_err(match)) return match;

  return mkval(T_BOOL, vtype(match) != T_NULL ? 1 : 0);
}
/*
 * Getter for RegExp.prototype.flags.
 *
 * Reads the boolean-ish properties hasIndices, global, ignoreCase,
 * multiline, dotAll, unicode, unicodeSets and sticky (in that order) from
 * the receiver and returns the string made of the letters d g i m s u v y
 * for those that are truthy. The first failing property read is returned.
 */
static ant_value_t builtin_regexp_flags_getter(ant_t *js, ant_value_t *args, int nargs) {
  ant_value_t self = js->this_val;
  if (!is_object_type(self)) {
    return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp.prototype.flags called on non-object");
  }

  char letters[16];
  int count = 0;

  // Reads one property and appends its flag letter when truthy.
#define FLAG_IF_SET(prop_name, letter) do { \
    ant_value_t prop_val = js_getprop_fallback(js, self, prop_name); \
    if (is_err(prop_val)) return prop_val; \
    if (js_truthy(js, prop_val)) letters[count++] = (letter); \
  } while (0)

  FLAG_IF_SET("hasIndices", 'd');
  FLAG_IF_SET("global", 'g');
  FLAG_IF_SET("ignoreCase", 'i');
  FLAG_IF_SET("multiline", 'm');
  FLAG_IF_SET("dotAll", 's');
  FLAG_IF_SET("unicode", 'u');
  FLAG_IF_SET("unicodeSets", 'v');
  FLAG_IF_SET("sticky", 'y');

#undef FLAG_IF_SET

  return js_mkstr(js, letters, count);
}
/*
 * RegExp.prototype[Symbol.match](string).
 *
 * Non-global regexps: returns the result of a single exec.
 * Global regexps: resets lastIndex to 0, then collects the stringified
 * element 0 of every exec result into an array until exec returns null.
 * Returns null when nothing matched, otherwise the array.
 *
 * After an empty match lastIndex is advanced manually so the loop makes
 * progress. For regexps with the "unicode" or "unicodeSets" flag the step
 * is the value reported by utf8_char_len_at for the current position
 * instead of 1; @@split in this file treats the 'u' and 'v' flags the same
 * way.
 */
static ant_value_t builtin_regexp_symbol_match(ant_t *js, ant_value_t *args, int nargs) {
  ant_value_t rx = js->this_val;
  if (!is_object_type(rx))
    return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp.prototype[@@match] called on non-object");

  ant_value_t str = nargs > 0 ? js_tostring_val(js, args[0]) : js_mkstr(js, "undefined", 9);
  if (is_err(str)) return str;

  ant_value_t global_val = js_getprop_fallback(js, rx, "global");
  if (is_err(global_val)) return global_val;
  if (!js_truthy(js, global_val))
    return regexp_exec_abstract(js, rx, str);

  ant_value_t unicode_val = js_getprop_fallback(js, rx, "unicode");
  if (is_err(unicode_val)) return unicode_val;
  bool full_unicode = js_truthy(js, unicode_val);
  if (!full_unicode) {
    // The 'v' flag implies the same whole-character advance as 'u'.
    ant_value_t unicode_sets_val = js_getprop_fallback(js, rx, "unicodeSets");
    if (is_err(unicode_sets_val)) return unicode_sets_val;
    full_unicode = js_truthy(js, unicode_sets_val);
  }

  js_setprop(js, rx, js_mkstr(js, "lastIndex", 9), tov(0));

  ant_value_t A = js_mkarr(js);
  if (is_err(A)) return A;
  ant_offset_t n = 0;

  for (;;) {
    ant_value_t result = regexp_exec_abstract(js, rx, str);
    if (is_err(result)) return result;
    if (vtype(result) == T_NULL) return n == 0 ? js_mknull() : mkval(T_ARR, vdata(A));

    ant_value_t match_str = js_tostring_val(js, js_arr_get(js, result, 0));
    if (is_err(match_str)) return match_str;
    js_arr_push(js, A, match_str);
    n++;

    ant_offset_t mlen;
    vstr(js, match_str, &mlen);
    if (mlen == 0) {
      // Empty match: bump lastIndex ourselves or exec would match the same
      // position forever.
      ant_value_t li_val = js_getprop_fallback(js, rx, "lastIndex");
      if (is_err(li_val)) return li_val;
      double li = vtype(li_val) == T_NUM ? tod(li_val) : 0;
      ant_offset_t str_len, str_off = vstr(js, str, &str_len);
      double advance = 1;
      if (full_unicode && li < (double)str_len) {
        advance = (double)utf8_char_len_at((const char *)(uintptr_t)(str_off), str_len, (ant_offset_t)li);
      }
      js_setprop(js, rx, js_mkstr(js, "lastIndex", 9), tov(li + advance));
    }
  }
}
/*
 * next() for the iterator created by RegExp.prototype[Symbol.matchAll].
 *
 * The iterator object carries the regexp, the subject string and a "done"
 * flag in its SLOT_MATCHALL_* slots. Each call runs one exec:
 *  - once the done flag is set, or when exec returns null, the result is
 *    js_iter_result(js, false, undefined);
 *  - otherwise the match is returned via js_iter_result(js, true, match).
 * For non-global regexps the done flag is set after the first match. For
 * global regexps an empty match advances lastIndex manually; regexps with
 * the "unicode" or "unicodeSets" flag advance by the value reported by
 * utf8_char_len_at, matching what @@match and @@replace do.
 *
 * Errors from exec and from any property read are propagated.
 */
static ant_value_t regexp_matchall_next(ant_t *js, ant_value_t *args, int nargs) {
  ant_value_t iter = js->this_val;
  ant_value_t rx = js_get_slot(iter, SLOT_MATCHALL_RX);
  ant_value_t str = js_get_slot(iter, SLOT_MATCHALL_STR);
  ant_value_t done_val = js_get_slot(iter, SLOT_MATCHALL_DONE);
  if (js_truthy(js, done_val))
    return js_iter_result(js, false, js_mkundef());

  ant_value_t result = regexp_exec_abstract(js, rx, str);
  if (is_err(result)) return result;
  if (vtype(result) == T_NULL) {
    js_set_slot(iter, SLOT_MATCHALL_DONE, js_true);
    return js_iter_result(js, false, js_mkundef());
  }

  ant_value_t global_val = js_getprop_fallback(js, rx, "global");
  if (is_err(global_val)) return global_val;
  if (!js_truthy(js, global_val)) {
    // A non-global regexp yields exactly one match.
    js_set_slot(iter, SLOT_MATCHALL_DONE, js_true);
    return js_iter_result(js, true, result);
  }

  ant_value_t match_str = js_tostring_val(js, js_arr_get(js, result, 0));
  if (is_err(match_str)) return match_str;
  ant_offset_t mlen;
  vstr(js, match_str, &mlen);

  if (mlen == 0) {
    // Empty match: move lastIndex forward so the next call makes progress.
    ant_value_t li_val = js_getprop_fallback(js, rx, "lastIndex");
    if (is_err(li_val)) return li_val;
    double li = vtype(li_val) == T_NUM ? tod(li_val) : 0;

    ant_value_t unicode_val = js_getprop_fallback(js, rx, "unicode");
    if (is_err(unicode_val)) return unicode_val;
    bool full_unicode = js_truthy(js, unicode_val);
    if (!full_unicode) {
      ant_value_t unicode_sets_val = js_getprop_fallback(js, rx, "unicodeSets");
      if (is_err(unicode_sets_val)) return unicode_sets_val;
      full_unicode = js_truthy(js, unicode_sets_val);
    }

    ant_offset_t str_len, str_off = vstr(js, str, &str_len);
    double advance = 1;
    if (full_unicode && li < (double)str_len) {
      advance = (double)utf8_char_len_at((const char *)(uintptr_t)(str_off), str_len, (ant_offset_t)li);
    }
    js_setprop(js, rx, js_mkstr(js, "lastIndex", 9), tov(li + advance));
  }

  return js_iter_result(js, true, result);
}
/*
 * RegExp.prototype[Symbol.matchAll](string).
 *
 * Builds a fresh regexp from the receiver's "source" and stringified
 * "flags" using the global RegExp constructor, copies the receiver's
 * lastIndex onto it, and returns an iterator object whose slots hold the
 * new regexp, the stringified subject and a cleared "done" flag. The
 * iterator's prototype is regexp_matchall_iter_proto_val.
 *
 * Every failing step (property reads, constructor lookup/call, allocation)
 * is propagated to the caller instead of being fed into the next step.
 */
static ant_value_t builtin_regexp_symbol_matchAll(ant_t *js, ant_value_t *args, int nargs) {
  ant_value_t rx = js->this_val;
  if (!is_object_type(rx))
    return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp.prototype[@@matchAll] called on non-object");

  ant_value_t str = nargs > 0 ? js_tostring_val(js, args[0]) : js_mkstr(js, "undefined", 9);
  if (is_err(str)) return str;

  ant_value_t flags_val = js_getprop_fallback(js, rx, "flags");
  if (is_err(flags_val)) return flags_val;
  ant_value_t flags_str = js_tostring_val(js, flags_val);
  if (is_err(flags_str)) return flags_str;

  ant_value_t source_val = js_getprop_fallback(js, rx, "source");
  if (is_err(source_val)) return source_val;

  ant_value_t ctor_args[2] = { source_val, flags_str };
  ant_value_t regexp_ctor = js_get(js, js_glob(js), "RegExp");
  if (is_err(regexp_ctor)) return regexp_ctor;
  ant_value_t new_rx = sv_vm_call(js->vm, js, regexp_ctor, js_mkundef(), ctor_args, 2, NULL, true);
  if (is_err(new_rx)) return new_rx;

  // The clone starts matching where the original would have.
  ant_value_t li_val = js_getprop_fallback(js, rx, "lastIndex");
  if (is_err(li_val)) return li_val;
  js_setprop(js, new_rx, js_mkstr(js, "lastIndex", 9), li_val);

  ant_value_t iter = js_mkobj(js);
  if (is_err(iter)) return iter;
  js_set_slot(iter, SLOT_MATCHALL_RX, new_rx);
  js_set_slot(iter, SLOT_MATCHALL_STR, str);
  js_set_slot(iter, SLOT_MATCHALL_DONE, js_false);
  js_set_proto_init(iter, regexp_matchall_iter_proto_val);
  return iter;
}
/*
 * String.prototype.matchAll(regexp).
 *
 * - Requires at least one argument (TypeError otherwise).
 * - If the argument is an object that is regexp-like, its stringified
 *   "flags" must contain 'g' (TypeError otherwise).
 * - If the argument is an object with a Symbol.matchAll method, that method
 *   is called with the stringified receiver and its result returned.
 * - Otherwise the argument is stringified, compiled with the global RegExp
 *   constructor using the "g" flag, and handed to the built-in
 *   RegExp.prototype[Symbol.matchAll].
 *
 * All intermediate errors are propagated, and js->this_val is restored
 * after the internal call to the built-in.
 */
static ant_value_t builtin_string_matchAll(ant_t *js, ant_value_t *args, int nargs) {
  ant_value_t this_unwrapped = unwrap_primitive(js, js->this_val);
  ant_value_t str = js_tostring_val(js, this_unwrapped);
  if (is_err(str)) return str;
  if (nargs < 1) return js_mkerr_typed(js, JS_ERR_TYPE, "matchAll requires at least 1 argument");

  if (is_object_type(args[0])) {
    ant_value_t is_re = is_regexp_like(js, args[0]);
    if (is_err(is_re)) return is_re;
    if (js_truthy(js, is_re)) {
      ant_value_t flags_val = js_getprop_fallback(js, args[0], "flags");
      if (is_err(flags_val)) return flags_val;
      ant_value_t flags_str = js_tostring_val(js, flags_val);
      // Must be checked before vstr(): an error value has no string bytes.
      if (is_err(flags_str)) return flags_str;
      ant_offset_t flen, foff = vstr(js, flags_str, &flen);
      const char *fp = (const char *)(uintptr_t)(foff);
      bool has_g = false;
      for (ant_offset_t i = 0; i < flen; i++) if (fp[i] == 'g') has_g = true;
      if (!has_g) return js_mkerr_typed(js, JS_ERR_TYPE, "String.prototype.matchAll called with a non-global RegExp");
    }

    bool called = false;
    ant_value_t call_args[1] = { str };
    ant_value_t dispatched = maybe_call_symbol_method(
      js, args[0], get_matchAll_sym(), args[0], call_args, 1, &called
    );
    if (is_err(dispatched)) return dispatched;
    if (called) return dispatched;
  }

  ant_value_t pattern_str = js_tostring_val(js, args[0]);
  if (is_err(pattern_str)) return pattern_str;

  ant_value_t ctor_args[2] = { pattern_str, js_mkstr(js, "g", 1) };
  ant_value_t regexp_ctor = js_get(js, js_glob(js), "RegExp");
  if (is_err(regexp_ctor)) return regexp_ctor;
  ant_value_t rx = sv_vm_call(js->vm, js, regexp_ctor, js_mkundef(), ctor_args, 2, NULL, true);
  if (is_err(rx)) return rx;

  // The built-in reads its receiver from js->this_val; swap it in for the
  // duration of the call only (same pattern as regexp_exec_with_exec_fn).
  ant_value_t ma_args[1] = { str };
  ant_value_t saved_this = js->this_val;
  js->this_val = rx;
  ant_value_t iter = builtin_regexp_symbol_matchAll(js, ma_args, 1);
  js->this_val = saved_this;
  return iter;
}
/*
 * RegExp.prototype[Symbol.replace](string, replaceValue).
 *
 * Phase 1 collects exec results: one for a non-global regexp, all of them
 * for a global one (lastIndex is reset to 0 first and bumped manually after
 * empty matches; regexps with the "unicode" or "unicodeSets" flag advance
 * by the value reported by utf8_char_len_at, the same way @@split treats
 * the 'u' and 'v' flags).
 *
 * Phase 2 walks the collected results and assembles the output in a heap
 * buffer: the untouched text before each match, then the replacement.
 *  - Function replaceValue: called with up to 30 result elements, the match
 *    position and the subject; its stringified return value is appended
 *    verbatim.
 *  - Otherwise replaceValue is stringified once and expanded per match by
 *    repl_template with the captures from the result.
 * Results whose position lies before text already emitted are skipped.
 *
 * Returns the assembled string; errors from exec, callbacks, conversions
 * and allocation ("oom") are returned with the buffer released.
 */
static ant_value_t builtin_regexp_symbol_replace(ant_t *js, ant_value_t *args, int nargs) {
  ant_value_t rx = js->this_val;
  if (!is_object_type(rx))
    return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp.prototype[@@replace] called on non-object");

  ant_value_t str = nargs > 0 ? js_tostring_val(js, args[0]) : js_mkstr(js, "undefined", 9);
  if (is_err(str)) return str;

  ant_value_t replace_value = nargs > 1 ? args[1] : js_mkundef();
  bool func_replace = (vtype(replace_value) == T_FUNC || vtype(replace_value) == T_CFUNC);
  ant_value_t replace_str = js_mkundef();
  if (!func_replace) {
    replace_str = js_tostring_val(js, replace_value);
    if (is_err(replace_str)) return replace_str;
  }

  ant_value_t global_val = js_getprop_fallback(js, rx, "global");
  if (is_err(global_val)) return global_val;
  bool global = js_truthy(js, global_val);
  bool full_unicode = false;
  if (global) {
    ant_value_t unicode_val = js_getprop_fallback(js, rx, "unicode");
    if (is_err(unicode_val)) return unicode_val;
    full_unicode = js_truthy(js, unicode_val);
    if (!full_unicode) {
      // The 'v' flag implies the same whole-character advance as 'u'.
      ant_value_t unicode_sets_val = js_getprop_fallback(js, rx, "unicodeSets");
      if (is_err(unicode_sets_val)) return unicode_sets_val;
      full_unicode = js_truthy(js, unicode_sets_val);
    }
    js_setprop(js, rx, js_mkstr(js, "lastIndex", 9), tov(0));
  }

  /* ---- Phase 1: gather match results -------------------------------- */
  ant_value_t results = js_mkarr(js);
  if (is_err(results)) return results;
  ant_offset_t nresults = 0;
  for (;;) {
    ant_value_t result = regexp_exec_abstract(js, rx, str);
    if (is_err(result)) return result;
    if (vtype(result) == T_NULL) break;
    js_arr_push(js, results, result);
    nresults++;
    if (!global) break;

    ant_value_t match_str = js_tostring_val(js, js_arr_get(js, result, 0));
    if (is_err(match_str)) return match_str;
    ant_offset_t mlen; vstr(js, match_str, &mlen);
    if (mlen == 0) {
      // Empty match: bump lastIndex so the next exec makes progress.
      ant_value_t li_val = js_getprop_fallback(js, rx, "lastIndex");
      if (is_err(li_val)) return li_val;
      double li = vtype(li_val) == T_NUM ? tod(li_val) : 0;
      ant_offset_t sl, so = vstr(js, str, &sl);
      double advance = 1;
      if (full_unicode && li < (double)sl) {
        advance = (double)utf8_char_len_at((const char *)(uintptr_t)(so), sl, (ant_offset_t)li);
      }
      js_setprop(js, rx, js_mkstr(js, "lastIndex", 9), tov(li + advance));
    }
  }

  /* ---- Phase 2: build the output string ------------------------------ */
  ant_offset_t str_len, str_off = vstr(js, str, &str_len);
  size_t buf_cap = str_len + 256;
  char *buf = ant_calloc(buf_cap);
  if (!buf) return js_mkerr(js, "oom");
  size_t buf_len = 0;
  ant_offset_t next_src_pos = 0;  // first subject byte not yet copied to buf

  // Appends dlen bytes to buf, growing it as needed. On allocation failure
  // the buffer is freed and the enclosing function returns "oom".
#define SB_APPEND(data, dlen) do { \
    if (buf_len + (dlen) >= buf_cap) { \
      buf_cap = (buf_len + (dlen) + 1) * 2; \
      char *nb = ant_realloc(buf, buf_cap); \
      if (!nb) { free(buf); return js_mkerr(js, "oom"); } \
      buf = nb; \
    } \
    memcpy(buf + buf_len, data, dlen); buf_len += (dlen); \
  } while(0)

  for (ant_offset_t i = 0; i < nresults; i++) {
    ant_value_t result = js_arr_get(js, results, i);
    ant_value_t matched = js_tostring_val(js, js_arr_get(js, result, 0));
    if (is_err(matched)) { free(buf); return matched; }
    ant_offset_t matched_len; vstr(js, matched, &matched_len);

    // Match position, clamped to [0, str_len]; a missing or non-numeric
    // "index" counts as 0.
    ant_value_t pos_val = js_getprop_fallback(js, result, "index");
    ant_offset_t position = 0;
    if (!is_err(pos_val) && vtype(pos_val) == T_NUM) {
      double d = tod(pos_val);
      position = d < 0 ? 0 : (ant_offset_t)d;
    }
    if (position > str_len) position = str_len;

    ant_value_t replacement;
    if (func_replace) {
      // call_args holds at most 30 result elements plus position and
      // subject, which fills the 32-slot array exactly.
      ant_offset_t ncaptures = js_arr_len(js, result);
      ant_value_t call_args[32];
      int ca = 0;
      for (ant_offset_t c = 0; c < ncaptures && ca < 30; c++)
        call_args[ca++] = js_arr_get(js, result, c);
      call_args[ca++] = tov((double)position);
      call_args[ca++] = str;
      replacement = sv_vm_call(js->vm, js, replace_value, js_mkundef(), call_args, ca, NULL, false);
    } else {
      replacement = replace_str;
    }
    if (is_err(replacement)) { free(buf); return replacement; }
    ant_value_t rep_str = js_tostring_val(js, replacement);
    if (is_err(rep_str)) { free(buf); return rep_str; }

    if (position >= next_src_pos) {
      // Re-read the subject bytes after having called into JS above.
      str_off = vstr(js, str, &str_len);
      if (position > next_src_pos)
        SB_APPEND((const char *)(uintptr_t)(str_off + next_src_pos), position - next_src_pos);

      ant_offset_t rep_len, rep_off = vstr(js, rep_str, &rep_len);
      if (func_replace) {
        SB_APPEND((const char *)(uintptr_t)(rep_off), rep_len);
      } else {
        // Collect capture byte ranges for template expansion; small capture
        // counts use the stack array, larger ones a heap array.
        ant_offset_t ncap = js_arr_len(js, result);
        int num_caps = ncap > 1 ? (int)(ncap - 1) : 0;
        repl_capture_t caps_buf[16];
        repl_capture_t *caps = num_caps <= 16 ? caps_buf : ant_calloc(sizeof(repl_capture_t) * (size_t)num_caps);
        if (num_caps > 16 && !caps) {
          free(buf);
          return js_mkerr(js, "oom");
        }
        for (int ci = 0; ci < num_caps; ci++) {
          ant_value_t cap = js_arr_get(js, result, (ant_offset_t)(ci + 1));
          if (vtype(cap) == T_STR) {
            ant_offset_t cl, co = vstr(js, cap, &cl);
            caps[ci] = (repl_capture_t){ (const char *)(uintptr_t)(co), cl };
          } else {
            caps[ci] = (repl_capture_t){ NULL, 0 };
          }
        }
        ant_offset_t mlen, moff = vstr(js, matched, &mlen);
        str_off = vstr(js, str, &str_len);
        bool ok = repl_template(
          (const char *)(uintptr_t)(rep_off), rep_len,
          (const char *)(uintptr_t)(moff), mlen,
          (const char *)(uintptr_t)(str_off), str_len, position,
          caps, num_caps, &buf, &buf_len, &buf_cap
        );
        if (caps != caps_buf) free(caps);
        if (!ok) {
          free(buf);
          return js_mkerr(js, "oom");
        }
      }
      next_src_pos = position + matched_len;
    }
  }

  // Copy whatever follows the last replaced match.
  str_off = vstr(js, str, &str_len);
  if (next_src_pos < str_len)
    SB_APPEND((const char *)(uintptr_t)(str_off + next_src_pos), str_len - next_src_pos);
#undef SB_APPEND

  ant_value_t ret = js_mkstr(js, buf, buf_len);
  free(buf);
  return ret;
}
/*
 * RegExp.prototype[Symbol.search](string).
 *
 * Saves the receiver's lastIndex, sets it to 0, runs one exec, reads
 * lastIndex once more and then writes the saved value back. Returns the
 * numeric "index" of the match, or -1 when exec returned null or "index"
 * is not a number. Errors from any step are returned immediately.
 */
static ant_value_t builtin_regexp_symbol_search(ant_t *js, ant_value_t *args, int nargs) {
  ant_value_t self = js->this_val;
  if (!is_object_type(self)) {
    return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp.prototype[@@search] called on non-object");
  }

  ant_value_t subject;
  if (nargs > 0) subject = js_tostring_val(js, args[0]);
  else subject = js_mkstr(js, "undefined", 9);
  if (is_err(subject)) return subject;

  ant_value_t saved_last_index = js_getprop_fallback(js, self, "lastIndex");
  if (is_err(saved_last_index)) return saved_last_index;
  js_setprop(js, self, js_mkstr(js, "lastIndex", 9), tov(0));

  ant_value_t match = regexp_exec_abstract(js, self, subject);
  if (is_err(match)) return match;

  // The value itself is not used; only a failing read matters here.
  ant_value_t last_index_after = js_getprop_fallback(js, self, "lastIndex");
  if (is_err(last_index_after)) return last_index_after;
  js_setprop(js, self, js_mkstr(js, "lastIndex", 9), saved_last_index);

  if (vtype(match) == T_NULL) return tov(-1);

  ant_value_t index_val = js_getprop_fallback(js, match, "index");
  if (is_err(index_val)) return index_val;
  if (vtype(index_val) != T_NUM) return tov(-1);
  return index_val;
}
/*
 * RegExp.prototype[Symbol.split](string, limit).
 *
 * Steps, as implemented below:
 *  1. Pick the constructor: the receiver's constructor species, falling
 *     back to the global RegExp when the constructor is undefined or the
 *     species is undefined/null.
 *  2. Build a "splitter" regexp from the receiver with the same flags plus
 *     'y' (sticky), so each exec only matches exactly at lastIndex.
 *  3. Walk the subject: at each position q set lastIndex = q and exec. On a
 *     miss, or when the match end equals the start of the current piece,
 *     advance q (by utf8_char_len_at for 'u'/'v' regexps, else by 1). On a
 *     hit, push the piece [p, q) and the captures, then continue after the
 *     match.
 *  4. Push the trailing piece.
 * The result array stops growing as soon as it holds `limit` entries.
 */
static ant_value_t builtin_regexp_symbol_split(ant_t *js, ant_value_t *args, int nargs) {
  ant_value_t rx = js_getthis(js);
  if (!is_object_type(rx))
    return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp.prototype[@@split] called on non-object");

  ant_value_t str = nargs > 0 ? js_tostring_val(js, args[0]) : js_mkstr(js, "", 0);
  if (is_err(str)) return str;

  /* ---- 1. constructor / species -------------------------------------- */
  ant_value_t ctor = js_get(js, rx, "constructor");
  if (is_err(ctor)) return ctor;
  ant_value_t C;
  if (vtype(ctor) == T_UNDEF) {
    C = js_get(js, js_glob(js), "RegExp");
  } else if (!is_object_type(ctor)) {
    return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp.prototype[@@split]: constructor is not an object");
  } else {
    ant_value_t species = get_ctor_species_value(js, ctor);
    if (is_err(species)) return species;
    if (vtype(species) == T_UNDEF || vtype(species) == T_NULL)
      C = js_get(js, js_glob(js), "RegExp");
    else C = species;
  }
  if (is_err(C)) return C;
  if (vtype(C) != T_FUNC && vtype(C) != T_CFUNC)
    return js_mkerr_typed(js, JS_ERR_TYPE, "RegExp species is not a constructor");

  /* ---- 2. flags for the splitter -------------------------------------- */
  ant_value_t flags_val = js_get(js, rx, "flags");
  if (is_err(flags_val)) return flags_val;
  ant_value_t flags_str = js_tostring_val(js, flags_val);
  if (is_err(flags_str)) return flags_str;
  ant_offset_t flen, foff = vstr(js, flags_str, &flen);
  const char *fptr = (const char *)(uintptr_t)(foff);
  bool unicode_matching = false, has_sticky = false;
  for (ant_offset_t i = 0; i < flen; i++) {
    if (fptr[i] == 'u' || fptr[i] == 'v') unicode_matching = true;
    if (fptr[i] == 'y') has_sticky = true;
  }

  ant_value_t new_flags;
  if (has_sticky) new_flags = flags_str; else {
    char fbuf[16];
    foff = vstr(js, flags_str, &flen);
    fptr = (const char *)(uintptr_t)(foff);
    // Clamp AFTER re-reading the length: "flags" can be an arbitrary
    // user-supplied string, and copying more than 14 bytes plus the 'y'
    // would overrun fbuf. The 8 known flag letters always fit.
    if (flen > 14) flen = 14;
    memcpy(fbuf, fptr, flen);
    fbuf[flen] = 'y';
    new_flags = js_mkstr(js, fbuf, flen + 1);
  }

  ant_value_t ctor_args[2] = { rx, new_flags };
  ant_value_t splitter = regexp_species_construct(js, rx, C, ctor_args, 2);
  if (is_err(splitter)) return splitter;

  ant_value_t A = js_mkarr(js);
  if (is_err(A)) return A;
  ant_offset_t lengthA = 0;

  // A limit outside [0, UINT32_MAX] (or an undefined one) means "no limit".
  uint32_t lim = UINT32_MAX;
  if (nargs >= 2 && vtype(args[1]) != T_UNDEF) {
    double d = tod(args[1]);
    if (d >= 0 && d <= UINT32_MAX) lim = (uint32_t)d;
  }
  if (lim == 0) return mkval(T_ARR, vdata(A));

  ant_offset_t str_len, str_off = vstr(js, str, &str_len);
  ant_offset_t size = str_len;

  // Empty subject: the result is [""] unless the regexp matches "".
  if (size == 0) {
    ant_value_t z = regexp_exec_abstract(js, splitter, str);
    if (is_err(z)) return z;
    if (vtype(z) == T_NULL) js_arr_push(js, A, str);
    return mkval(T_ARR, vdata(A));
  }

  /* ---- 3. main scan ---------------------------------------------------- */
  ant_offset_t p = 0, q = p;  // p: start of current piece, q: probe position
  ant_value_t lastIndex_key = js_mkstr(js, "lastIndex", 9);
  while (q < size) {
    js_setprop(js, splitter, lastIndex_key, tov((double)q));
    ant_value_t z = regexp_exec_abstract(js, splitter, str);
    if (is_err(z)) return z;
    if (vtype(z) == T_NULL) {
      if (unicode_matching) {
        str_off = vstr(js, str, &str_len);
        q += utf8_char_len_at((const char *)(uintptr_t)(str_off), str_len, q);
      } else q++;
      continue;
    }

    // e: end of the match, clamped to [0, size].
    ant_value_t li_val = js_get(js, splitter, "lastIndex");
    if (is_err(li_val)) return li_val;
    double e_raw = vtype(li_val) == T_NUM ? tod(li_val) : 0;
    ant_offset_t e = (ant_offset_t)(e_raw < 0 ? 0 : (e_raw > (double)size ? (double)size : e_raw));
    if (e == p) {
      if (unicode_matching) {
        str_off = vstr(js, str, &str_len);
        q += utf8_char_len_at((const char *)(uintptr_t)(str_off), str_len, q);
      } else q++;
      continue;
    }

    // Re-read the subject bytes after having called into JS above; a real
    // length pointer is passed like every other vstr call in this function.
    str_off = vstr(js, str, &str_len);
    ant_value_t T_val = js_mkstr(js, (char *)(uintptr_t)(str_off + p), q - p);
    js_arr_push(js, A, T_val);
    lengthA++;
    if (lengthA == lim) return mkval(T_ARR, vdata(A));

    ant_offset_t num_caps = js_arr_len(js, z);
    for (ant_offset_t i = 1; i < num_caps; i++) {
      ant_value_t cap = js_arr_get(js, z, i);
      js_arr_push(js, A, cap);
      lengthA++;
      if (lengthA == lim) return mkval(T_ARR, vdata(A));
    }
    p = e;
    q = p;
  }

  /* ---- 4. trailing piece ---------------------------------------------- */
  str_off = vstr(js, str, &str_len);
  ant_value_t trailing = js_mkstr(js, (char *)(uintptr_t)(str_off + p), str_len - p);
  js_arr_push(js, A, trailing);
  return mkval(T_ARR, vdata(A));
}
/*
 * Compiles args.pattern with PCRE2 (UTF + UCP, optional caseless/multiline)
 * and matches it against args.str.
 *
 *  - args.global set: returns an array with the text of every match.
 *  - otherwise: returns an array holding the first match and its capture
 *    groups (undefined for unset groups) with an "index" property set to
 *    the match start offset.
 *
 * Returns null when the pattern does not compile or nothing matches, an
 * "oom" error when the match data block cannot be allocated, and any error
 * raised while building result strings. The compiled pattern and match data
 * are released on every path.
 */
ant_value_t do_regex_match_pcre2(ant_t *js, regex_match_args_t args) {
  char pcre2_pattern[4096];
  size_t pcre2_len = js_to_pcre2_pattern(args.pattern_ptr, args.pattern_len, pcre2_pattern, sizeof(pcre2_pattern), false);

  uint32_t options = PCRE2_UTF | PCRE2_UCP | PCRE2_MATCH_UNSET_BACKREF | PCRE2_DUPNAMES;
  if (args.ignore_case) options |= PCRE2_CASELESS;
  if (args.multiline) options |= PCRE2_MULTILINE;

  int errcode;
  PCRE2_SIZE erroffset;
  pcre2_code *re = pcre2_compile((PCRE2_SPTR)pcre2_pattern, pcre2_len, options, &errcode, &erroffset, NULL);
  if (re == NULL) return js_mknull();

  pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(re, NULL);
  if (match_data == NULL) {
    pcre2_code_free(re);
    return js_mkerr(js, "oom");
  }

  uint32_t capture_count;
  pcre2_pattern_info(re, PCRE2_INFO_CAPTURECOUNT, &capture_count);

  ant_value_t result_arr = js_mkarr(js);
  if (is_err(result_arr)) {
    pcre2_match_data_free(match_data);
    pcre2_code_free(re);
    return result_arr;
  }

  const unsigned char *subject = (const unsigned char *)args.str_ptr;
  PCRE2_SIZE subject_len = (PCRE2_SIZE)args.str_len;
  PCRE2_SIZE pos = 0;
  int match_count = 0;

  while (pos <= subject_len) {
    int rc = pcre2_match(re, (PCRE2_SPTR)args.str_ptr, args.str_len, pos, 0, match_data, NULL);
    if (rc < 0) break;

    PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(match_data);
    PCRE2_SIZE match_start = ovector[0];
    PCRE2_SIZE match_end = ovector[1];

    if (args.global) {
      ant_value_t match_str = js_mkstr(js, args.str_ptr + match_start, match_end - match_start);
      if (is_err(match_str)) {
        pcre2_match_data_free(match_data);
        pcre2_code_free(re);
        return match_str;
      }
      js_arr_push(js, result_arr, match_str);
    } else {
      for (uint32_t i = 0; i <= capture_count; i++) {
        PCRE2_SIZE start = ovector[2*i];
        PCRE2_SIZE end = ovector[2*i+1];
        if (start == PCRE2_UNSET) {
          js_arr_push(js, result_arr, js_mkundef());
        } else {
          ant_value_t match_str = js_mkstr(js, args.str_ptr + start, end - start);
          if (is_err(match_str)) {
            pcre2_match_data_free(match_data);
            pcre2_code_free(re);
            return match_str;
          }
          js_arr_push(js, result_arr, match_str);
        }
      }
      js_setprop(js, result_arr, js_mkstr(js, "index", 5), tov((double)match_start));
    }

    match_count++;
    if (!args.global) break;

    if (match_start == match_end) {
      // Empty match: step over one whole UTF-8 character. In PCRE2_UTF mode
      // a start offset that points at a continuation byte (10xxxxxx) is
      // rejected, which would end the scan early on non-ASCII subjects.
      pos = match_end + 1;
      while (pos < subject_len && (subject[pos] & 0xC0) == 0x80) pos++;
    } else {
      pos = match_end;
    }
  }

  pcre2_match_data_free(match_data);
  pcre2_code_free(re);
  if (match_count == 0) return js_mknull();
  return result_arr;
}
/*
 * Appends n bytes from data to the growable buffer (*buf, *len, *cap).
 *
 * The buffer is reallocated to twice the required size (plus one) whenever
 * the new length would reach the current capacity. Returns false only when
 * that reallocation fails, in which case *buf, *len and *cap are unchanged.
 * Appending zero bytes is a no-op that succeeds.
 */
static bool str_buf_append(char **buf, size_t *len, size_t *cap, const char *data, size_t n) {
  if (n == 0) return true;

  size_t needed = *len + n;
  if (needed >= *cap) {
    size_t grown_cap = (needed + 1) * 2;
    char *grown = (char *)ant_realloc(*buf, grown_cap);
    if (grown == NULL) return false;
    *buf = grown;
    *cap = grown_cap;
  }

  memcpy(*buf + *len, data, n);
  *len = needed;
  return true;
}
/*
 * Appends the replacement for one match of a plain-string search to the
 * output buffer.
 *
 * With is_func set, `replacement` is called with (matched text, position,
 * subject) and its stringified result is appended; otherwise the
 * repl_len bytes at repl_ptr are appended verbatim.
 *
 * Returns undefined on success, the callback/conversion error, or an "oom"
 * error when the buffer cannot grow.
 */
static inline ant_value_t emit_str_replacement(
  ant_t *js, ant_value_t replacement, bool is_func,
  const char *repl_ptr, ant_offset_t repl_len,
  const char *str_ptr, ant_value_t str,
  ant_offset_t pos, ant_offset_t match_len,
  char **buf, size_t *buf_len, size_t *buf_cap
) {
  if (!is_func) {
    if (!str_buf_append(buf, buf_len, buf_cap, repl_ptr, repl_len)) {
      return js_mkerr(js, "oom");
    }
    return js_mkundef();
  }

  ant_value_t callback_args[3] = { js_mkstr(js, str_ptr + pos, match_len), tov((double)pos), str };
  ant_value_t produced = sv_vm_call(js->vm, js, replacement, js_mkundef(), callback_args, 3, NULL, false);
  if (vtype(produced) == T_ERR) return produced;

  ant_value_t produced_str = js_tostring_val(js, produced);
  if (is_err(produced_str)) return produced_str;

  ant_offset_t produced_len, produced_off = vstr(js, produced_str, &produced_len);
  bool appended = str_buf_append(buf, buf_len, buf_cap, (const char *)(uintptr_t)produced_off, produced_len);
  if (!appended) return js_mkerr(js, "oom");

  return js_mkundef();
}
/*
 * Shared implementation of String.prototype.replace / replaceAll.
 *
 * 1. The receiver is unwrapped and stringified. With no arguments it is
 *    returned unchanged.
 * 2. If the search value is an object:
 *      - for replaceAll, a regexp-like search value must have a truthy
 *        "global" property (TypeError otherwise);
 *      - a Symbol.replace method on it, if present, handles the call.
 * 3. With fewer than two arguments the receiver is returned unchanged.
 * 4. Otherwise the search value is stringified and matched literally. The
 *    replacement is either a callable (T_FUNC or T_CFUNC), invoked per match
 *    via emit_str_replacement, or a value that is stringified and inserted
 *    verbatim.
 *
 * replace substitutes the first occurrence; replaceAll substitutes every
 * non-overlapping occurrence. An empty search string in replaceAll inserts
 * the replacement before every character and once at the end, stepping over
 * whole UTF-8 sequences so multi-byte characters are never split.
 *
 * When nothing matches, the original string value is returned.
 *
 * NOTE(review): str_ptr/search_ptr stay cached across replacement callbacks;
 * other functions in this file re-read vstr() after calling into JS.
 * Confirm string storage cannot move during such a call.
 */
static ant_value_t string_replace_impl(ant_t *js, ant_value_t *args, int nargs, bool replace_all) {
  ant_value_t this_unwrapped = unwrap_primitive(js, js->this_val);
  ant_value_t str = js_tostring_val(js, this_unwrapped);
  if (is_err(str)) return str;
  if (nargs < 1) return str;

  if (is_object_type(args[0])) {
    if (replace_all) {
      // Only regexp-like search values are subject to the global-flag
      // requirement; other objects go straight to Symbol.replace dispatch.
      ant_value_t is_re = is_regexp_like(js, args[0]);
      if (is_err(is_re)) return is_re;
      if (js_truthy(js, is_re)) {
        ant_value_t global_val = js_getprop_fallback(js, args[0], "global");
        if (is_err(global_val)) return global_val;
        if (!js_truthy(js, global_val)) return js_mkerr_typed(js, JS_ERR_TYPE, "String.prototype.replaceAll called with a non-global RegExp");
      }
    }
    bool called = false;
    ant_value_t replacement_arg = nargs > 1 ? args[1] : js_mkundef();
    ant_value_t call_args[2] = { str, replacement_arg };
    ant_value_t result = maybe_call_symbol_method(js, args[0], get_replace_sym(), args[0], call_args, 2, &called);
    if (is_err(result)) return result;
    if (called) return result;
  }
  if (nargs < 2) return str;

  // Coerce the search value instead of giving up on non-strings, so that
  // e.g. "a1b".replace(1, "x") finds "1".
  ant_value_t search = js_tostring_val(js, args[0]);
  if (is_err(search)) return search;

  ant_value_t replacement = args[1];
  bool is_func = (vtype(replacement) == T_FUNC || vtype(replacement) == T_CFUNC);
  if (!is_func) {
    replacement = js_tostring_val(js, replacement);
    if (is_err(replacement)) return replacement;
  }

  // Resolve byte ranges only after all conversions above are done.
  ant_offset_t str_len, str_off = vstr(js, str, &str_len);
  const char *str_ptr = (char *)(uintptr_t)(str_off);
  ant_offset_t search_len, search_off = vstr(js, search, &search_len);
  const char *search_ptr = (char *)(uintptr_t)(search_off);
  ant_offset_t repl_len = 0;
  const char *repl_ptr = NULL;
  if (!is_func) {
    ant_offset_t repl_off = vstr(js, replacement, &repl_len);
    repl_ptr = (char *)(uintptr_t)(repl_off);
  }

  if (!replace_all) {
    if (search_len > str_len) return str;

    // Find the first occurrence.
    ant_offset_t match_pos = 0;
    bool found = false;
    for (ant_offset_t i = 0; i <= str_len - search_len; i++)
      if (memcmp(str_ptr + i, search_ptr, search_len) == 0) {
        match_pos = i; found = true; break;
      }
    if (!found) return str;

    size_t cap = str_len + repl_len + 256, len = 0;
    char *buf = (char *)ant_calloc(cap);
    if (!buf) return js_mkerr(js, "oom");

    if (!str_buf_append(&buf, &len, &cap, str_ptr, match_pos)) {
      free(buf);
      return js_mkerr(js, "oom");
    }
    ant_value_t err = emit_str_replacement(
      js, replacement, is_func, repl_ptr,
      repl_len, str_ptr, str, match_pos,
      search_len, &buf, &len, &cap
    );
    if (vtype(err) == T_ERR) {
      free(buf);
      return err;
    }
    if (!str_buf_append(
      &buf, &len, &cap, str_ptr + match_pos + search_len,
      str_len - match_pos - search_len)
    ) {
      free(buf);
      return js_mkerr(js, "oom");
    }
    ant_value_t ret = js_mkstr(js, buf, len);
    free(buf);
    return ret;
  }

  size_t cap = str_len + repl_len + 256, len = 0;
  char *buf = (char *)ant_calloc(cap);
  if (!buf) return js_mkerr(js, "oom");

  ant_offset_t pos = 0;
  bool replaced = false;
  while (pos + search_len <= str_len) {
    bool at_match = search_len == 0 || memcmp(str_ptr + pos, search_ptr, search_len) == 0;
    if (at_match) {
      replaced = true;
      ant_value_t err = emit_str_replacement(js, replacement, is_func, repl_ptr, repl_len, str_ptr, str, pos, search_len, &buf, &len, &cap);
      if (vtype(err) == T_ERR) { free(buf); return err; }
      if (search_len != 0) { pos += search_len; continue; }
      // Empty search string: the replacement just emitted at the very end
      // of the subject is the last one. Stop here so pos never exceeds
      // str_len and the tail length below cannot underflow.
      if (pos == str_len) break;
    }

    // Copy one character through unchanged. For the empty-search case step
    // over the whole UTF-8 sequence so the next insertion point falls on a
    // character boundary.
    ant_offset_t step = 1;
    if (search_len == 0) {
      while (pos + step < str_len && ((unsigned char)str_ptr[pos + step] & 0xC0) == 0x80) step++;
    }
    if (!str_buf_append(&buf, &len, &cap, str_ptr + pos, step)) { free(buf); return js_mkerr(js, "oom"); }
    pos += step;
  }

  // Remaining tail that is too short to contain another match.
  if (!str_buf_append(
    &buf, &len, &cap, str_ptr + pos,
    str_len - pos)
  ) {
    free(buf);
    return js_mkerr(js, "oom");
  }
  if (!replaced) {
    free(buf);
    return str;
  }
  ant_value_t ret = js_mkstr(js, buf, len);
  free(buf);
  return ret;
}
/* String.prototype.replace: string_replace_impl in first-occurrence mode. */
static ant_value_t builtin_string_replace(ant_t *js, ant_value_t *args, int nargs) {
  const bool every_occurrence = false;
  return string_replace_impl(js, args, nargs, every_occurrence);
}
/* String.prototype.replaceAll: string_replace_impl in all-occurrences mode. */
static ant_value_t builtin_string_replaceAll(ant_t *js, ant_value_t *args, int nargs) {
  const bool every_occurrence = true;
  return string_replace_impl(js, args, nargs, every_occurrence);
}
/*
 * String.prototype.search(pattern).
 *
 * An object argument with a Symbol.search method is delegated to. Otherwise
 * the pattern text is taken from a string argument, or from the "source"
 * and "flags" own-lookups of a T_OBJ argument (only the 'i' and 'm' flags
 * are honoured), compiled with PCRE2 and matched once from offset 0.
 *
 * Returns the start offset of the first match, or -1 when there is no
 * argument, the pattern is unusable, compilation fails or nothing matches.
 */
static ant_value_t builtin_string_search(ant_t *js, ant_value_t *args, int nargs) {
  ant_value_t receiver = unwrap_primitive(js, js->this_val);
  ant_value_t subject = js_tostring_val(js, receiver);
  if (is_err(subject)) return subject;
  if (nargs < 1) return tov(-1);

  if (is_object_type(args[0])) {
    bool handled = false;
    ant_value_t method_args[1] = { subject };
    ant_value_t outcome = maybe_call_symbol_method(
      js, args[0], get_search_sym(), args[0], method_args, 1, &handled
    );
    if (is_err(outcome)) return outcome;
    if (handled) return outcome;
  }

  ant_value_t pattern = args[0];
  const char *pat_ptr = NULL;
  ant_offset_t pat_len = 0;
  bool caseless = false, multi = false;

  if (vtype(pattern) == T_STR) {
    ant_offset_t pat_off = vstr(js, pattern, &pat_len);
    pat_ptr = (char *)(uintptr_t)(pat_off);
  } else if (vtype(pattern) == T_OBJ) {
    ant_offset_t source_ref = lkp(js, pattern, "source", 6);
    if (source_ref == 0) return tov(-1);
    ant_value_t source_val = js_propref_load(js, source_ref);
    if (vtype(source_val) != T_STR) return tov(-1);
    ant_offset_t pat_off = vstr(js, source_val, &pat_len);
    pat_ptr = (char *)(uintptr_t)(pat_off);

    ant_offset_t flags_ref = lkp(js, pattern, "flags", 5);
    if (flags_ref != 0) {
      ant_value_t flags_val = js_propref_load(js, flags_ref);
      if (vtype(flags_val) == T_STR) {
        ant_offset_t flags_len, flags_off = vstr(js, flags_val, &flags_len);
        const char *flag = (char *)(uintptr_t)(flags_off);
        for (const char *flags_end = flag + flags_len; flag != flags_end; flag++) {
          switch (*flag) {
            case 'i': caseless = true; break;
            case 'm': multi = true; break;
            default: break;
          }
        }
      }
    }
  } else {
    return tov(-1);
  }

  ant_offset_t subject_len, subject_off = vstr(js, subject, &subject_len);
  const char *subject_ptr = (char *)(uintptr_t)(subject_off);

  char translated[4096];
  size_t translated_len = js_to_pcre2_pattern(pat_ptr, pat_len, translated, sizeof(translated), false);

  uint32_t compile_opts = PCRE2_UTF | PCRE2_UCP | PCRE2_MATCH_UNSET_BACKREF | PCRE2_DUPNAMES;
  if (caseless) compile_opts |= PCRE2_CASELESS;
  if (multi) compile_opts |= PCRE2_MULTILINE;

  int compile_err;
  PCRE2_SIZE compile_err_at;
  pcre2_code *compiled = pcre2_compile((PCRE2_SPTR)translated, translated_len, compile_opts, &compile_err, &compile_err_at, NULL);
  if (compiled == NULL) return tov(-1);

  pcre2_match_data *md = pcre2_match_data_create_from_pattern(compiled, NULL);
  int rc = pcre2_match(compiled, (PCRE2_SPTR)subject_ptr, subject_len, 0, 0, md, NULL);

  // Single cleanup path: work out the answer first, then release PCRE2 state.
  double found_at = -1;
  if (rc >= 0) {
    PCRE2_SIZE *offsets = pcre2_get_ovector_pointer(md);
    found_at = (double)offsets[0];
  }
  pcre2_match_data_free(md);
  pcre2_code_free(compiled);
  return tov(found_at);
}
/*
 * Native backing for the "match" method installed on String.prototype.
 *
 * The receiver (js->this_val) is unwrapped and converted to a string first.
 * If the first argument is an object, its Symbol.match method is given the
 * chance to handle the call; when that method exists its result (or error)
 * is returned directly. Otherwise the pattern text is taken from the
 * object's "source" property (with the g/i/m flags read from "flags"), or
 * from the argument itself when it is a plain string.
 *
 * Returns null when called without arguments, when the pattern is neither an
 * object nor a string, or when an object pattern has no string "source".
 * For a non-global match that yields an array, an "input" property holding
 * the subject string is attached to the result.
 */
static ant_value_t builtin_string_match(ant_t *js, ant_value_t *args, int nargs) {
  ant_value_t receiver = unwrap_primitive(js, js->this_val);
  ant_value_t str = js_tostring_val(js, receiver);
  if (is_err(str)) return str;
  if (nargs < 1) return js_mknull();

  // Let an object argument handle the call through its Symbol.match method.
  if (is_object_type(args[0])) {
    bool called = false;
    ant_value_t call_args[1] = { str };
    ant_value_t dispatched = maybe_call_symbol_method(
      js, args[0], get_match_sym(), args[0], call_args, 1, &called
    );
    if (is_err(dispatched)) return dispatched;
    if (called) return dispatched;
  }

  ant_value_t pattern = args[0];
  const char *pattern_ptr = NULL;
  ant_offset_t pattern_len = 0;
  bool global_flag = false;
  bool ignore_case = false;
  bool multiline = false;

  if (vtype(pattern) == T_STR) {
    // Plain string pattern: used as-is, no flags.
    pattern_ptr = (char *)(uintptr_t)vstr(js, pattern, &pattern_len);
  } else if (vtype(pattern) == T_OBJ) {
    // Object pattern: pull the regex text out of its "source" property.
    ant_offset_t source_off = lkp(js, pattern, "source", 6);
    if (source_off == 0) return js_mknull();
    ant_value_t source_val = js_propref_load(js, source_off);
    if (vtype(source_val) != T_STR) return js_mknull();
    pattern_ptr = (char *)(uintptr_t)vstr(js, source_val, &pattern_len);

    // Only the g, i and m flags are honoured on this path.
    ant_offset_t flags_off = lkp(js, pattern, "flags", 5);
    if (flags_off != 0) {
      ant_value_t flags_val = js_propref_load(js, flags_off);
      if (vtype(flags_val) == T_STR) {
        ant_offset_t flen;
        ant_offset_t foff = vstr(js, flags_val, &flen);
        const char *flags_str = (char *)(uintptr_t)foff;
        for (ant_offset_t i = 0; i < flen; i++) {
          switch (flags_str[i]) {
            case 'g': global_flag = true; break;
            case 'i': ignore_case = true; break;
            case 'm': multiline = true; break;
            default: break;
          }
        }
      }
    }
  } else {
    return js_mknull();
  }

  ant_offset_t str_len;
  ant_offset_t str_off = vstr(js, str, &str_len);
  const char *str_ptr = (char *)(uintptr_t)str_off;

  regex_match_args_t match_args = {
    .pattern_ptr = pattern_ptr, .pattern_len = pattern_len,
    .str_ptr = str_ptr, .str_len = str_len,
    .global = global_flag, .ignore_case = ignore_case, .multiline = multiline,
  };
  ant_value_t result = do_regex_match_pcre2(js, match_args);

  // Global matches and non-array results are returned untouched.
  if (global_flag || vtype(result) != T_ARR) return result;

  js_setprop(js, result, js_mkstr(js, "input", 5), str);
  return result;
}
void init_regex_module(void) {
ant_t *js = rt->js;
ant_value_t glob = js->global;
ant_value_t object_proto = js->sym.object_proto;
ant_value_t regexp_proto = js_mkobj(js);
js_set_proto_init(regexp_proto, object_proto);
defmethod(js, regexp_proto, "test", 4, js_mkfun(builtin_regexp_test));
defmethod(js, regexp_proto, "exec", 4, js_mkfun(builtin_regexp_exec));
defmethod(js, regexp_proto, "toString", 8, js_mkfun(builtin_regexp_toString));
js_mkprop_fast(js, regexp_proto, "global", 6, js_false);
js_mkprop_fast(js, regexp_proto, "ignoreCase", 10, js_false);
js_mkprop_fast(js, regexp_proto, "multiline", 9, js_false);
js_mkprop_fast(js, regexp_proto, "dotAll", 6, js_false);
js_mkprop_fast(js, regexp_proto, "unicode", 7, js_false);
js_mkprop_fast(js, regexp_proto, "sticky", 6, js_false);
js_mkprop_fast(js, regexp_proto, "hasIndices", 10, js_false);
js_mkprop_fast(js, regexp_proto, "unicodeSets", 11, js_false);
js_set_sym(js, regexp_proto, get_split_sym(), js_mkfun(builtin_regexp_symbol_split));
js_set_sym(js, regexp_proto, get_match_sym(), js_mkfun(builtin_regexp_symbol_match));
js_set_sym(js, regexp_proto, get_matchAll_sym(), js_mkfun(builtin_regexp_symbol_matchAll));
regexp_matchall_iter_proto_val = js_mkobj(js);
js_set_proto_init(regexp_matchall_iter_proto_val, js->sym.iterator_proto);
defmethod(js, regexp_matchall_iter_proto_val, "next", 4, js_mkfun(regexp_matchall_next));
js_set_sym(js, regexp_matchall_iter_proto_val, get_iterator_sym(), js_mkfun(sym_this_cb));
js_set_sym(js, regexp_proto, get_replace_sym(), js_mkfun(builtin_regexp_symbol_replace));
js_set_sym(js, regexp_proto, get_search_sym(), js_mkfun(builtin_regexp_symbol_search));
js_set_sym(js, regexp_proto, get_toStringTag_sym(), js_mkstr(js, "RegExp", 6));
js_set_getter_desc(js, regexp_proto, "flags", 5, js_mkfun(builtin_regexp_flags_getter), JS_DESC_C);
defmethod(js, regexp_proto, "compile", 7, js_mkfun(builtin_regexp_compile));
ant_value_t regexp_ctor = js_mkobj(js);
js_set_slot(regexp_ctor, SLOT_CFUNC, js_mkfun(builtin_RegExp));
js_mkprop_fast(js, regexp_ctor, "prototype", 9, regexp_proto);
js_mkprop_fast(js, regexp_ctor, "name", 4, js_mkstr(js, "RegExp", 6));
js_set_descriptor(js, regexp_ctor, "name", 4, 0);
js_define_species_getter(js, regexp_ctor);
ant_value_t regexp_func = js_obj_to_func(regexp_ctor);
js_setprop(js, regexp_proto, js_mkstr(js, "constructor", 11), regexp_func);
js_set_descriptor(js, regexp_proto, "constructor", 11, JS_DESC_W | JS_DESC_C);
js_set(js, regexp_ctor, "escape", js_mkfun(builtin_regexp_escape));
ant_value_t empty = js_mkstr(js, "", 0);
for (int i = 1; i <= 9; i++) {
char key[3] = {'$', (char)('0' + i), '\0'};
js_set(js, regexp_ctor, key, empty);
}
js_set(js, regexp_ctor, "lastMatch", empty);
js_set(js, regexp_ctor, "$&", empty);
js_set(js, glob, "RegExp", regexp_func);
ant_value_t string_ctor = js_get(js, glob, "String");
ant_value_t string_proto = js_get(js, string_ctor, "prototype");
defmethod(js, string_proto, "search", 6, js_mkfun(builtin_string_search));
defmethod(js, string_proto, "match", 5, js_mkfun(builtin_string_match));
defmethod(js, string_proto, "matchAll", 8, js_mkfun(builtin_string_matchAll));
defmethod(js, string_proto, "replace", 7, js_mkfun(builtin_string_replace));
defmethod(js, string_proto, "replaceAll", 10, js_mkfun(builtin_string_replaceAll));
}
/*
 * Sweeps the compiled-regex cache.
 *
 * Entries whose owning object is not marked have their PCRE2 match data and
 * compiled code freed. Surviving entries are compacted towards the front of
 * the array in their original order, and regex_cache_count is updated to the
 * number of survivors.
 */
void gc_sweep_regex_cache(void) {
  size_t kept = 0;

  for (size_t read = 0; read < regex_cache_count; read++) {
    if (gc_obj_is_marked(regex_cache[read].obj)) {
      // Still referenced: slide it down into the next free slot if needed.
      if (kept != read) regex_cache[kept] = regex_cache[read];
      kept++;
      continue;
    }

    // Unmarked owner: release the native PCRE2 resources.
    pcre2_match_data_free(regex_cache[read].match_data);
    pcre2_code_free(regex_cache[read].code);
  }

  regex_cache_count = kept;
}
/*
 * Tears down the compiled-regex cache: frees the PCRE2 match data and
 * compiled code of every cached entry, releases the cache array itself, and
 * resets the cache pointer, count and capacity to their empty state.
 */
void cleanup_regex_module(void) {
  size_t idx = 0;
  while (idx < regex_cache_count) {
    pcre2_match_data_free(regex_cache[idx].match_data);
    pcre2_code_free(regex_cache[idx].code);
    idx++;
  }

  free(regex_cache);

  // Leave the module globals describing an empty, unallocated cache.
  regex_cache_cap = 0;
  regex_cache_count = 0;
  regex_cache = NULL;
}

File Metadata

Mime Type
text/x-c
Expires
Sat, May 2, 6:15 AM (1 d, 23 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
541769
Default Alt Text
regex.c (76 KB)

Event Timeline