Page MenuHomePhorge

utf8.c
No OneTemporary

Size
5 KB
Referenced Files
None
Subscribers
None
#include "utf8.h"
#include <string.h>
/*
 * Decode one code point from the start of buf.
 *
 * buf      bytes to decode
 * len      number of bytes available in buf
 * seq_len  out: 0 when len is 0, otherwise whatever utf8_next() returns
 *          (narrowed to int)
 *
 * Returns 0 for empty input, U+FFFD when the decoded value is negative,
 * and the decoded value otherwise.
 *
 * NOTE(review): utf8_next() is declared outside this file; presumably it
 * returns the number of bytes consumed and always stores into its last
 * argument -- confirm, since `codepoint` is read unconditionally below.
 */
static uint32_t utf8_decode(const unsigned char *buf, size_t len, int *seq_len) {
    utf8proc_int32_t codepoint;

    if (len == 0) {
        *seq_len = 0;
        return 0;
    }

    *seq_len = (int)utf8_next(buf, (utf8proc_ssize_t)len, &codepoint);

    /* A negative value is mapped to the replacement character. */
    if (codepoint < 0) {
        return 0xFFFD;
    }
    return (uint32_t)codepoint;
}
/*
 * Length in bytes of the character that starts at byte offset pos.
 *
 * str       buffer holding the text
 * byte_len  number of bytes in str
 * pos       byte offset of the character's first byte
 *
 * Returns 1 when pos is outside the buffer or when utf8_sequence_length()
 * reports a non-positive length for the byte at pos. Otherwise returns the
 * reported length, capped at the number of bytes left in the buffer.
 */
size_t utf8_char_len_at(const char *str, size_t byte_len, size_t pos) {
    if (pos >= byte_len) {
        return 1;
    }

    const int reported = utf8_sequence_length((unsigned char)str[pos]);
    if (reported <= 0) {
        return 1;
    }

    /* Never report more bytes than the buffer still holds. */
    const size_t remaining = byte_len - pos;
    const size_t wanted = (size_t)reported;
    return (wanted > remaining) ? remaining : wanted;
}
/*
 * Count the characters in the first byte_len bytes of str.
 *
 * Each step consumes the length reported by utf8_sequence_length() for the
 * current byte. When that length is non-positive or would run past the end
 * of the buffer, a single byte is consumed instead. Every step counts as
 * one character.
 */
size_t utf8_strlen(const char *str, size_t byte_len) {
    const unsigned char *bytes = (const unsigned char *)str;
    size_t chars = 0;
    size_t off = 0;

    while (off < byte_len) {
        const int reported = utf8_sequence_length(bytes[off]);
        const size_t remaining = byte_len - off;
        size_t step = 1;

        if (reported > 0 && (size_t)reported <= remaining) {
            step = (size_t)reported;
        }

        off += step;
        chars++;
    }

    return chars;
}
/*
 * Number of UTF-16 code units needed for the first byte_len bytes of str.
 *
 * Fast path: while every byte seen so far has its high bit clear, the
 * answer equals the byte count, so the buffer is scanned eight bytes at a
 * time (then byte by byte for the tail) looking for any high bit.
 *
 * Slow path: starting at the position where the scan stopped, every byte
 * that is not a continuation byte (10xxxxxx) contributes one unit, and a
 * four-byte lead (11110xxx) contributes a second unit.
 */
size_t utf16_strlen(const char *str, size_t byte_len) {
    const unsigned char *bytes = (const unsigned char *)str;
    const uint64_t high_bits = 0x8080808080808080ULL;
    size_t ascii_prefix = 0;
    int only_ascii = 1;

    /* Word-at-a-time scan; memcpy keeps the load alignment-safe. */
    while (byte_len - ascii_prefix >= 8) {
        uint64_t word;
        memcpy(&word, bytes + ascii_prefix, 8);
        if (word & high_bits) {
            only_ascii = 0;
            break;
        }
        ascii_prefix += 8;
    }

    /* Tail shorter than one word. */
    if (only_ascii) {
        while (ascii_prefix < byte_len) {
            if (bytes[ascii_prefix] & 0x80) {
                only_ascii = 0;
                break;
            }
            ascii_prefix++;
        }
    }

    if (only_ascii) {
        return byte_len;
    }

    /* Everything before ascii_prefix was one unit per byte. */
    size_t units = ascii_prefix;
    for (size_t k = ascii_prefix; k < byte_len; k++) {
        const unsigned char b = bytes[k];
        if ((b & 0xC0) == 0x80) {
            continue; /* continuation byte: belongs to a previous lead */
        }
        units += ((b & 0xF8) == 0xF0) ? 2 : 1;
    }
    return units;
}
/*
 * Sequence length (1-4 bytes) announced by a UTF-8 lead byte.
 * Bytes that are not a valid lead (continuation bytes, 0xF8-0xFF) count as 1.
 */
static size_t utf16_offset_seq_len(unsigned char lead) {
    if (lead < 0x80) return 1;
    if ((lead & 0xE0) == 0xC0) return 2;
    if ((lead & 0xF0) == 0xE0) return 3;
    if ((lead & 0xF8) == 0xF0) return 4;
    return 1;
}

/*
 * Translate a UTF-16 code-unit index into a byte offset within str.
 *
 * str             UTF-8 text
 * byte_len        number of bytes in str
 * utf16_idx       index measured in UTF-16 code units
 * out_char_bytes  optional out: byte length of the character found at the
 *                 returned offset (0 when the offset is the end of the
 *                 buffer); may be NULL
 *
 * Returns the byte offset, byte_len when utf16_idx is exactly the end of
 * the text, or -1 when utf16_idx lies beyond the end. The offset is
 * narrowed to int on return.
 *
 * Four-byte sequences advance the UTF-16 position by two, so an index that
 * points at the second half of a surrogate pair resolves to the character
 * that follows the pair (or -1 if the pair is the last character).
 *
 * A sequence cut short by the end of the buffer is clamped to the bytes
 * that actually remain, both when stepping and in *out_char_bytes, so the
 * scan never moves past byte_len and callers are never told to read beyond
 * the buffer.
 */
int utf16_index_to_byte_offset(
    const char *str,
    size_t byte_len,
    size_t utf16_idx,
    size_t *out_char_bytes
) {
    const unsigned char *bytes = (const unsigned char *)str;
    size_t off = 0;
    size_t utf16_pos = 0;

    while (off < byte_len && utf16_pos < utf16_idx) {
        const size_t slen = utf16_offset_seq_len(bytes[off]);
        const size_t remaining = byte_len - off;

        utf16_pos += (slen == 4) ? 2 : 1;
        /* Clamp instead of overshooting and correcting afterwards. */
        off += (slen < remaining) ? slen : remaining;
    }

    if (off >= byte_len) {
        if (utf16_pos != utf16_idx) {
            return -1;
        }
        if (out_char_bytes) {
            *out_char_bytes = 0;
        }
        return (int)byte_len;
    }

    if (out_char_bytes) {
        const size_t slen = utf16_offset_seq_len(bytes[off]);
        const size_t remaining = byte_len - off;
        *out_char_bytes = (slen < remaining) ? slen : remaining;
    }
    return (int)off;
}
/*
 * Translate a UTF-16 code-unit range [utf16_start, utf16_end) into byte
 * offsets within str.
 *
 * str          UTF-8 text
 * byte_len     number of bytes in str
 * utf16_start  first code unit of the range
 * utf16_end    code unit just past the range
 * byte_start   out: byte offset matching utf16_start (must not be NULL)
 * byte_end     out: byte offset matching utf16_end (must not be NULL)
 *
 * Always returns 0.
 *
 * Positions at or beyond the end of the text map to byte_len. Four-byte
 * sequences advance the UTF-16 position by two, so a position pointing at
 * the second half of a surrogate pair is never matched during the scan: in
 * that case *byte_start keeps its initial value 0 and *byte_end keeps its
 * initial value byte_len.
 *
 * The scan steps by the length announced by each lead byte, clamped to the
 * bytes that remain, so a sequence truncated by the end of the buffer never
 * moves the cursor past byte_len.
 */
int utf16_range_to_byte_range(
    const char *str,
    size_t byte_len,
    size_t utf16_start,
    size_t utf16_end,
    size_t *byte_start,
    size_t *byte_end
) {
    const unsigned char *bytes = (const unsigned char *)str;
    size_t off = 0;
    size_t utf16_pos = 0;
    size_t b_start = 0;
    size_t b_end = byte_len; /* stays byte_len unless utf16_end is matched */
    int found_start = 0;

    while (off < byte_len) {
        if (utf16_pos == utf16_start) {
            b_start = off;
            found_start = 1;
        }
        if (utf16_pos == utf16_end) {
            b_end = off;
            break;
        }

        const unsigned char c = bytes[off];
        size_t slen = 1;
        size_t units = 1;
        if (c >= 0x80) {
            if ((c & 0xE0) == 0xC0) {
                slen = 2;
            } else if ((c & 0xF0) == 0xE0) {
                slen = 3;
            } else if ((c & 0xF8) == 0xF0) {
                slen = 4;
                units = 2;
            }
            /* anything else is a stray byte: one byte, one unit */
        }

        const size_t remaining = byte_len - off;
        off += (slen < remaining) ? slen : remaining;
        utf16_pos += units;
    }

    /* Start at or past the end of the text maps to the end of the buffer. */
    if (!found_start && utf16_start >= utf16_pos) {
        b_start = byte_len;
    }

    *byte_start = b_start;
    *byte_end = b_end;
    return 0;
}
/*
 * Return the UTF-16 code unit at index utf16_idx of the UTF-8 text in str.
 *
 * str        UTF-8 text
 * byte_len   number of bytes in str
 * utf16_idx  index measured in UTF-16 code units
 *
 * For a four-byte sequence the first index yields the high surrogate and
 * the next index yields the low surrogate. A lead byte whose sequence would
 * not fit in the remaining bytes, and any byte that is not a recognised
 * lead, is treated as a single unit whose value is the raw byte.
 * Continuation bytes are not validated.
 *
 * Returns 0xFFFFFFFF when utf16_idx is past the end of the text.
 *
 * Bounds are checked against the number of bytes remaining rather than by
 * comparing advanced pointers, so no pointer beyond one-past-the-end of the
 * buffer is ever formed.
 */
uint32_t utf16_code_unit_at(const char *str, size_t byte_len, size_t utf16_idx) {
    const unsigned char *bytes = (const unsigned char *)str;
    size_t off = 0;
    size_t utf16_pos = 0;

    while (off < byte_len) {
        const unsigned char *p = bytes + off;
        const size_t remaining = byte_len - off;
        const unsigned char c = p[0];

        /* Default: one raw byte, one unit (ASCII and malformed input). */
        uint32_t cp = c;
        size_t slen = 1;
        size_t units = 1;

        if (c >= 0x80) {
            if ((c & 0xE0) == 0xC0 && remaining >= 2) {
                cp = ((uint32_t)(c & 0x1F) << 6)
                   | (uint32_t)(p[1] & 0x3F);
                slen = 2;
            } else if ((c & 0xF0) == 0xE0 && remaining >= 3) {
                cp = ((uint32_t)(c & 0x0F) << 12)
                   | ((uint32_t)(p[1] & 0x3F) << 6)
                   | (uint32_t)(p[2] & 0x3F);
                slen = 3;
            } else if ((c & 0xF8) == 0xF0 && remaining >= 4) {
                cp = ((uint32_t)(c & 0x07) << 18)
                   | ((uint32_t)(p[1] & 0x3F) << 12)
                   | ((uint32_t)(p[2] & 0x3F) << 6)
                   | (uint32_t)(p[3] & 0x3F);
                slen = 4;
                units = 2; /* encoded as a surrogate pair */
            }
        }

        if (utf16_pos == utf16_idx) {
            if (units == 2) {
                return 0xD800 + ((cp - 0x10000) >> 10); /* high surrogate */
            }
            return cp;
        }
        if (units == 2 && utf16_pos + 1 == utf16_idx) {
            return 0xDC00 + ((cp - 0x10000) & 0x3FF); /* low surrogate */
        }

        off += slen;
        utf16_pos += units;
    }

    return 0xFFFFFFFF;
}
/*
 * Return the code point that starts at UTF-16 index utf16_idx of the UTF-8
 * text in str.
 *
 * str        UTF-8 text
 * byte_len   number of bytes in str
 * utf16_idx  index measured in UTF-16 code units
 *
 * When utf16_idx is the first unit of a character, the full decoded code
 * point is returned (including values above 0xFFFF). When it is the second
 * unit of a four-byte character, only the low surrogate is returned. A lead
 * byte whose sequence would not fit in the remaining bytes, and any byte
 * that is not a recognised lead, is treated as a single unit whose value is
 * the raw byte. Continuation bytes are not validated.
 *
 * Returns 0xFFFFFFFF when utf16_idx is past the end of the text.
 *
 * Bounds are checked against the number of bytes remaining rather than by
 * comparing advanced pointers, so no pointer beyond one-past-the-end of the
 * buffer is ever formed.
 */
uint32_t utf16_codepoint_at(const char *str, size_t byte_len, size_t utf16_idx) {
    const unsigned char *bytes = (const unsigned char *)str;
    size_t off = 0;
    size_t utf16_pos = 0;

    while (off < byte_len) {
        const unsigned char *p = bytes + off;
        const size_t remaining = byte_len - off;
        const unsigned char c = p[0];

        /* Default: one raw byte, one unit (ASCII and malformed input). */
        uint32_t cp = c;
        size_t slen = 1;
        size_t units = 1;

        if (c >= 0x80) {
            if ((c & 0xE0) == 0xC0 && remaining >= 2) {
                cp = ((uint32_t)(c & 0x1F) << 6)
                   | (uint32_t)(p[1] & 0x3F);
                slen = 2;
            } else if ((c & 0xF0) == 0xE0 && remaining >= 3) {
                cp = ((uint32_t)(c & 0x0F) << 12)
                   | ((uint32_t)(p[1] & 0x3F) << 6)
                   | (uint32_t)(p[2] & 0x3F);
                slen = 3;
            } else if ((c & 0xF8) == 0xF0 && remaining >= 4) {
                cp = ((uint32_t)(c & 0x07) << 18)
                   | ((uint32_t)(p[1] & 0x3F) << 12)
                   | ((uint32_t)(p[2] & 0x3F) << 6)
                   | (uint32_t)(p[3] & 0x3F);
                slen = 4;
                units = 2; /* occupies two UTF-16 code units */
            }
        }

        if (utf16_pos == utf16_idx) {
            return cp;
        }
        if (units == 2 && utf16_pos + 1 == utf16_idx) {
            /* Index lands on the trailing half of a surrogate pair. */
            return 0xDC00 + ((cp - 0x10000) & 0x3FF);
        }

        off += slen;
        utf16_pos += units;
    }

    return 0xFFFFFFFF;
}

File Metadata

Mime Type
text/x-c
Expires
Thu, Mar 26, 4:46 PM (1 d, 23 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
511639
Default Alt Text
utf8.c (5 KB)

Event Timeline