mirror of https://github.com/php/php-src
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1039 lines
27 KiB
1039 lines
27 KiB
/*
|
|
* Copyright (C) 2023 Alexander Borisov
|
|
*
|
|
* Author: Alexander Borisov <borisov@lexbor.com>
|
|
*/
|
|
|
|
#include <math.h>
|
|
#include <inttypes.h>
|
|
|
|
#include "lexbor/unicode/unicode.h"
|
|
#include "lexbor/unicode/res.h"
|
|
#include "lexbor/encoding/encoding.h"
|
|
|
|
|
|
typedef union {
|
|
lexbor_serialize_cb_cp_f cp_cb;
|
|
lexbor_serialize_cb_f cb;
|
|
}
|
|
lxb_unicode_callback_u;
|
|
|
|
|
|
/* Hangul syllables for modern Korean. */
|
|
static const lxb_codepoint_t lxb_unicode_sb = 0xAC00;
|
|
static const lxb_codepoint_t lxb_unicode_sl = 0xD7A3;
|
|
/* Hangul vowels (syllable nucleuses). */
|
|
static const lxb_codepoint_t lxb_unicode_lb = 0x1100;
|
|
static const lxb_codepoint_t lxb_unicode_ll = 0x1112;
|
|
/* Hangul vowels (syllable nucleuses). */
|
|
static const lxb_codepoint_t lxb_unicode_vb = 0x1161;
|
|
static const lxb_codepoint_t lxb_unicode_vl = 0x1175;
|
|
/* Hangul trailing consonants (syllable codas). */
|
|
static const lxb_codepoint_t lxb_unicode_tb = 0x11A8;
|
|
static const lxb_codepoint_t lxb_unicode_tl = 0x11C2;
|
|
static const lxb_codepoint_t lxb_unicode_ts = 0x11A7;
|
|
static const lxb_codepoint_t lxb_unicode_vc = 0x15;
|
|
static const lxb_codepoint_t lxb_unicode_tc = 0x1C;
|
|
|
|
|
|
static lxb_status_t
|
|
lxb_unicode_normalize_body(lxb_unicode_normalizer_t *uc, const void *data,
|
|
size_t length, lxb_unicode_callback_u *cb, void *ctx,
|
|
bool is_last, bool is_cp);
|
|
|
|
static void
|
|
lxb_unicode_canonical(lxb_unicode_buffer_t *starter, lxb_unicode_buffer_t *op,
|
|
lxb_unicode_buffer_t *p);
|
|
|
|
static void
|
|
lxb_unicode_compatibility(lxb_unicode_buffer_t *starter,
|
|
lxb_unicode_buffer_t *op, lxb_unicode_buffer_t *p);
|
|
|
|
static void
|
|
lxb_unicode_canonical_composition(lxb_unicode_buffer_t *p,
|
|
const lxb_unicode_buffer_t *end);
|
|
|
|
static lxb_unicode_buffer_t *
|
|
lxb_unicode_canonical_decomposition(lxb_unicode_normalizer_t *uc,
|
|
lxb_codepoint_t cp,
|
|
lxb_unicode_buffer_t **buf,
|
|
const lxb_unicode_buffer_t **end);
|
|
|
|
static lxb_unicode_buffer_t *
|
|
lxb_unicode_compatibility_decomposition(lxb_unicode_normalizer_t *uc,
|
|
lxb_codepoint_t cp,
|
|
lxb_unicode_buffer_t **buf,
|
|
const lxb_unicode_buffer_t **end);
|
|
|
|
static lxb_unicode_buffer_t *
|
|
lxb_unicode_decomposition(lxb_unicode_normalizer_t *uc, lxb_codepoint_t cp,
|
|
lxb_unicode_buffer_t **buf,
|
|
const lxb_unicode_buffer_t **end,
|
|
lxb_unicode_decomposition_type_t type);
|
|
|
|
static lxb_unicode_buffer_t *
|
|
lxb_unicode_entry_decomposition_hangul(lxb_unicode_normalizer_t *uc,
|
|
lxb_unicode_buffer_t **buf,
|
|
const lxb_unicode_buffer_t **end,
|
|
lxb_codepoint_t cp);
|
|
|
|
static lxb_codepoint_t
|
|
lxb_unicode_entry_compose_hangul(lxb_codepoint_t first, lxb_codepoint_t second);
|
|
|
|
|
|
lxb_unicode_normalizer_t *
|
|
lxb_unicode_normalizer_create(void)
|
|
{
|
|
return lexbor_malloc(sizeof(lxb_unicode_normalizer_t));
|
|
}
|
|
|
|
lxb_status_t
|
|
lxb_unicode_normalizer_init(lxb_unicode_normalizer_t *uc,
|
|
lxb_unicode_form_t form)
|
|
{
|
|
lxb_status_t status;
|
|
static const size_t buf_length = 4096;
|
|
|
|
if (uc == NULL) {
|
|
return LXB_STATUS_ERROR_OBJECT_IS_NULL;
|
|
}
|
|
|
|
status = lxb_unicode_normalization_form_set(uc, form);
|
|
if (status != LXB_STATUS_OK) {
|
|
return status;
|
|
}
|
|
|
|
uc->tmp_lenght = 0;
|
|
uc->starter = NULL;
|
|
|
|
uc->buf = lexbor_malloc(buf_length * sizeof(lxb_unicode_buffer_t));
|
|
if (uc->buf == NULL) {
|
|
return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
|
|
}
|
|
|
|
uc->end = uc->buf + buf_length;
|
|
uc->p = uc->buf;
|
|
uc->ican = uc->buf;
|
|
uc->quick_ccc = 0;
|
|
uc->flush_cp = 1024;
|
|
|
|
return LXB_STATUS_OK;
|
|
}
|
|
|
|
void
|
|
lxb_unicode_normalizer_clean(lxb_unicode_normalizer_t *uc)
|
|
{
|
|
uc->tmp_lenght = 0;
|
|
uc->starter = NULL;
|
|
uc->p = uc->buf;
|
|
uc->ican = uc->buf;
|
|
uc->quick_ccc = 0;
|
|
}
|
|
|
|
/*
 * Releases the working buffer of a normalizer and, optionally, the object
 * itself.
 *
 * @param uc           normalizer to destroy; NULL is accepted.
 * @param self_destroy when true the object is freed as well.
 *
 * @return NULL if uc is NULL; the result of lexbor_free(uc) when
 *         self_destroy is true; otherwise uc, with its buffer released.
 */
lxb_unicode_normalizer_t *
lxb_unicode_normalizer_destroy(lxb_unicode_normalizer_t *uc, bool self_destroy)
{
    if (uc == NULL) {
        return NULL;
    }

    if (uc->buf != NULL) {
        /* Store the value returned by lexbor_free() back into uc->buf. */
        uc->buf = lexbor_free(uc->buf);
    }

    if (!self_destroy) {
        return uc;
    }

    return lexbor_free(uc);
}
|
|
|
|
lxb_status_t
|
|
lxb_unicode_normalization_form_set(lxb_unicode_normalizer_t *uc,
|
|
lxb_unicode_form_t form)
|
|
{
|
|
switch (form) {
|
|
case LXB_UNICODE_NFC:
|
|
uc->decomposition = lxb_unicode_canonical_decomposition;
|
|
uc->composition = lxb_unicode_canonical;
|
|
uc->quick_type = LXB_UNICODE_QUICK_NFC_NO|LXB_UNICODE_QUICK_NFC_MAYBE;
|
|
break;
|
|
|
|
case LXB_UNICODE_NFD:
|
|
uc->decomposition = lxb_unicode_canonical_decomposition;
|
|
uc->composition = lxb_unicode_compatibility;
|
|
uc->quick_type = LXB_UNICODE_QUICK_NFD_NO;
|
|
break;
|
|
|
|
case LXB_UNICODE_NFKC:
|
|
uc->decomposition = lxb_unicode_compatibility_decomposition;
|
|
uc->composition = lxb_unicode_canonical;
|
|
uc->quick_type = LXB_UNICODE_QUICK_NFKC_NO|LXB_UNICODE_QUICK_NFKC_MAYBE;
|
|
break;
|
|
|
|
case LXB_UNICODE_NFKD:
|
|
uc->decomposition = lxb_unicode_compatibility_decomposition;
|
|
uc->composition = lxb_unicode_compatibility;
|
|
uc->quick_type = LXB_UNICODE_QUICK_NFKD_NO;
|
|
break;
|
|
|
|
default:
|
|
return LXB_STATUS_ERROR_WRONG_ARGS;
|
|
}
|
|
|
|
return LXB_STATUS_OK;
|
|
}
|
|
|
|
lxb_status_t
|
|
lxb_unicode_flush(lxb_unicode_normalizer_t *uc, lexbor_serialize_cb_f cb,
|
|
void *ctx)
|
|
{
|
|
int8_t res;
|
|
lxb_char_t *tmp;
|
|
lxb_status_t status;
|
|
lxb_unicode_buffer_t *p, *end;
|
|
lxb_char_t buffer[4096];
|
|
const lxb_char_t *buffer_end = buffer + sizeof(buffer);
|
|
|
|
p = uc->buf;
|
|
end = uc->ican;
|
|
tmp = buffer;
|
|
|
|
while (p < end) {
|
|
if (p->cp != LXB_ENCODING_ERROR_CODEPOINT) {
|
|
res = lxb_encoding_encode_utf_8_single(NULL, &tmp, buffer_end,
|
|
p->cp);
|
|
if (res == LXB_ENCODING_ENCODE_SMALL_BUFFER) {
|
|
status = cb(buffer, tmp - buffer, ctx);
|
|
if (status != LXB_STATUS_OK) {
|
|
return status;
|
|
}
|
|
|
|
tmp = buffer;
|
|
|
|
continue;
|
|
}
|
|
}
|
|
|
|
p += 1;
|
|
}
|
|
|
|
if (tmp != buffer) {
|
|
return cb(buffer, tmp - buffer, ctx);
|
|
}
|
|
|
|
return LXB_STATUS_OK;
|
|
}
|
|
|
|
lxb_status_t
|
|
lxb_unicode_flush_cp(lxb_unicode_normalizer_t *uc, lexbor_serialize_cb_cp_f cb,
|
|
void *ctx)
|
|
{
|
|
lxb_status_t status;
|
|
lxb_unicode_buffer_t *p, *end;
|
|
lxb_codepoint_t *tmp;
|
|
lxb_codepoint_t buffer[4096];
|
|
const lxb_codepoint_t *buffer_end;
|
|
|
|
buffer_end = buffer + (sizeof(buffer) / sizeof(lxb_codepoint_t));
|
|
|
|
p = uc->buf;
|
|
end = uc->ican;
|
|
tmp = buffer;
|
|
|
|
while (p < end) {
|
|
if (p->cp != LXB_ENCODING_ERROR_CODEPOINT) {
|
|
*tmp++ = p->cp;
|
|
|
|
if (tmp >= buffer_end) {
|
|
status = cb(buffer, tmp - buffer, ctx);
|
|
if (status != LXB_STATUS_OK) {
|
|
return status;
|
|
}
|
|
|
|
tmp = buffer;
|
|
}
|
|
}
|
|
|
|
p += 1;
|
|
}
|
|
|
|
if (tmp != buffer) {
|
|
return cb(buffer, tmp - buffer, ctx);
|
|
}
|
|
|
|
return LXB_STATUS_OK;
|
|
}
|
|
|
|
lxb_inline void
|
|
lxb_unicode_check_buf(lxb_unicode_normalizer_t *uc, lxb_unicode_buffer_t **p,
|
|
const lxb_unicode_buffer_t **end, size_t length)
|
|
{
|
|
size_t len, new_len, starter_len;
|
|
lxb_unicode_buffer_t *buf;
|
|
|
|
static const size_t buf_length = 1024;
|
|
|
|
if (*p + length >= *end) {
|
|
len = *p - uc->buf;
|
|
new_len = (uc->end - uc->buf) + buf_length + length;
|
|
starter_len = (uc->starter != NULL) ? uc->starter - uc->buf : 0;
|
|
|
|
buf = lexbor_realloc(uc->buf, new_len * sizeof(lxb_unicode_buffer_t));
|
|
if (buf == NULL) {
|
|
*p = NULL;
|
|
return;
|
|
}
|
|
|
|
if (uc->starter != NULL) {
|
|
uc->starter = buf + starter_len;
|
|
}
|
|
|
|
uc->buf = buf;
|
|
uc->end = buf + new_len;
|
|
|
|
*p = buf + len;
|
|
*end = uc->end;
|
|
}
|
|
}
|
|
|
|
lxb_inline void
|
|
lxb_unicode_reorder(lxb_unicode_buffer_t *p, lxb_unicode_buffer_t *starter)
|
|
{
|
|
lxb_unicode_buffer_t swap;
|
|
lxb_unicode_buffer_t *end = p;
|
|
|
|
while (p > starter) {
|
|
if (p[-1].ccc > p->ccc) {
|
|
swap = *p;
|
|
|
|
*p = p[-1];
|
|
p[-1] = swap;
|
|
|
|
if (p < end) {
|
|
p += 1;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
p -= 1;
|
|
}
|
|
}
|
|
|
|
lxb_inline const lxb_char_t *
|
|
lxb_unicode_restore(lxb_unicode_normalizer_t *uc, const lxb_char_t *data,
|
|
const lxb_char_t *end, lxb_codepoint_t *cp, bool is_last)
|
|
{
|
|
size_t i, len;
|
|
lxb_char_t *tmp;
|
|
|
|
tmp = uc->tmp;
|
|
len = uc->tmp_lenght;
|
|
|
|
i = lxb_encoding_decode_utf_8_length(tmp[0]);
|
|
|
|
while (len < i && data < end) {
|
|
tmp[ len++ ] = *data;
|
|
data += 1;
|
|
}
|
|
|
|
*cp = lxb_encoding_decode_valid_utf_8_single((const lxb_char_t **) &tmp,
|
|
tmp + i);
|
|
if (*cp == LXB_ENCODING_DECODE_ERROR) {
|
|
if (!is_last) {
|
|
uc->tmp_lenght = len;
|
|
return NULL;
|
|
}
|
|
|
|
*cp = LXB_ENCODING_REPLACEMENT_CODEPOINT;
|
|
}
|
|
|
|
uc->tmp_lenght = 0;
|
|
|
|
return data;
|
|
}
|
|
|
|
|
|
|
|
lxb_status_t
|
|
lxb_unicode_normalize(lxb_unicode_normalizer_t *uc, const lxb_char_t *data,
|
|
size_t length, lexbor_serialize_cb_f cb, void *ctx,
|
|
bool is_last)
|
|
{
|
|
lxb_unicode_callback_u cu = {.cb = cb};
|
|
|
|
return lxb_unicode_normalize_body(uc, data, length, &cu, ctx, is_last, false);
|
|
}
|
|
|
|
lxb_status_t
|
|
lxb_unicode_normalize_cp(lxb_unicode_normalizer_t *uc, const lxb_codepoint_t *cps,
|
|
size_t length, lexbor_serialize_cb_cp_f cb, void *ctx,
|
|
bool is_last)
|
|
{
|
|
lxb_unicode_callback_u cu = {.cp_cb = cb};
|
|
|
|
return lxb_unicode_normalize_body(uc, cps, length, &cu, ctx, is_last, true);
|
|
}
|
|
|
|
static lxb_status_t
|
|
lxb_unicode_normalize_body(lxb_unicode_normalizer_t *uc, const void *data,
|
|
size_t length, lxb_unicode_callback_u *cb, void *ctx,
|
|
bool is_last, bool is_cp)
|
|
{
|
|
lxb_status_t status;
|
|
lxb_codepoint_t cp;
|
|
const lxb_char_t *end, *tp, *np;
|
|
lxb_unicode_buffer_t *p, *dp, *op, *buf;
|
|
const lxb_unicode_buffer_t *buf_end;
|
|
|
|
buf_end = uc->end;
|
|
p = uc->p;
|
|
|
|
np = data;
|
|
length *= (is_cp) ? sizeof(lxb_codepoint_t) : 1;
|
|
end = (const lxb_char_t *) data + length;
|
|
|
|
if (uc->tmp_lenght != 0 && !is_cp) {
|
|
np = lxb_unicode_restore(uc, np, end, &cp, is_last);
|
|
if (np == NULL) {
|
|
return LXB_STATUS_OK;
|
|
}
|
|
|
|
goto restore;
|
|
}
|
|
|
|
while (np < end) {
|
|
if (!is_cp) {
|
|
tp = np;
|
|
|
|
cp = lxb_encoding_decode_valid_utf_8_single(&np, end);
|
|
if (cp == LXB_ENCODING_DECODE_ERROR) {
|
|
if (np >= end && !is_last) {
|
|
uc->p = p;
|
|
uc->tmp_lenght = end - tp;
|
|
|
|
memcpy(uc->tmp, tp, uc->tmp_lenght);
|
|
|
|
return LXB_STATUS_OK;
|
|
}
|
|
|
|
cp = LXB_ENCODING_REPLACEMENT_CODEPOINT;
|
|
}
|
|
}
|
|
else {
|
|
cp = *((const lxb_codepoint_t *) np);
|
|
np = (const lxb_char_t *) ((const lxb_codepoint_t *) np + 1);
|
|
}
|
|
|
|
restore:
|
|
|
|
dp = uc->decomposition(uc, cp, &p, &buf_end);
|
|
if (dp == NULL) {
|
|
lxb_unicode_normalizer_clean(uc);
|
|
return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
|
|
}
|
|
|
|
while (p < dp) {
|
|
if (p->ccc == 0) {
|
|
op = p - 1;
|
|
buf = uc->buf;
|
|
|
|
if (uc->starter == NULL) {
|
|
lxb_unicode_reorder(op, buf);
|
|
|
|
uc->starter = p++;
|
|
continue;
|
|
}
|
|
|
|
uc->composition(uc->starter, op, p + 1);
|
|
|
|
if (p->cp != LXB_ENCODING_ERROR_CODEPOINT) {
|
|
uc->starter = p;
|
|
uc->ican = p;
|
|
|
|
if (p - buf >= uc->flush_cp) {
|
|
if (!is_cp) {
|
|
status = lxb_unicode_flush(uc, cb->cb, ctx);
|
|
}
|
|
else {
|
|
status = lxb_unicode_flush_cp(uc, cb->cp_cb, ctx);
|
|
}
|
|
|
|
if (status != LXB_STATUS_OK) {
|
|
return status;
|
|
}
|
|
|
|
buf->cp = p->cp;
|
|
buf->ccc = p->ccc;
|
|
|
|
dp = buf + (dp - p);
|
|
p = buf;
|
|
|
|
uc->starter = p;
|
|
uc->ican = p;
|
|
}
|
|
}
|
|
}
|
|
|
|
p += 1;
|
|
}
|
|
}
|
|
|
|
status = LXB_STATUS_OK;
|
|
|
|
if (is_last) {
|
|
if (uc->starter != NULL && uc->starter != p - 1) {
|
|
uc->composition(uc->starter, p - 1, p);
|
|
}
|
|
|
|
uc->ican = p;
|
|
|
|
if (!is_cp) {
|
|
status = lxb_unicode_flush(uc, cb->cb, ctx);
|
|
}
|
|
else {
|
|
status = lxb_unicode_flush_cp(uc, cb->cp_cb, ctx);
|
|
}
|
|
|
|
uc->p = uc->buf;
|
|
uc->ican = uc->buf;
|
|
uc->starter = NULL;
|
|
}
|
|
else {
|
|
uc->p = p;
|
|
}
|
|
|
|
return status;
|
|
}
|
|
|
|
lxb_status_t
|
|
lxb_unicode_normalize_end(lxb_unicode_normalizer_t *uc,
|
|
lexbor_serialize_cb_f cb, void *ctx)
|
|
{
|
|
return lxb_unicode_normalize(uc, NULL, 0, cb, ctx, true);
|
|
}
|
|
|
|
lxb_status_t
|
|
lxb_unicode_normalize_cp_end(lxb_unicode_normalizer_t *uc,
|
|
lexbor_serialize_cb_cp_f cb, void *ctx)
|
|
{
|
|
return lxb_unicode_normalize_cp(uc, NULL, 0, cb, ctx, true);
|
|
}
|
|
|
|
bool
|
|
lxb_unicode_quick_check(lxb_unicode_normalizer_t *uc, const lxb_char_t *data,
|
|
size_t length, bool is_last)
|
|
{
|
|
lxb_codepoint_t cp;
|
|
const lxb_char_t *end, *tp;
|
|
const lxb_unicode_normalization_entry_t *entry;
|
|
|
|
end = data + length;
|
|
|
|
if (uc->tmp_lenght != 0) {
|
|
data = lxb_unicode_restore(uc, data, end, &cp, is_last);
|
|
if (data == NULL) {
|
|
return LXB_STATUS_OK;
|
|
}
|
|
|
|
goto restore;
|
|
}
|
|
|
|
while (data < end) {
|
|
tp = data;
|
|
|
|
cp = lxb_encoding_decode_valid_utf_8_single(&data, end);
|
|
if (cp == LXB_ENCODING_DECODE_ERROR) {
|
|
if (data >= end && !is_last) {
|
|
uc->tmp_lenght = end - tp;
|
|
|
|
memcpy(uc->tmp, tp, uc->tmp_lenght);
|
|
|
|
return LXB_STATUS_OK;
|
|
}
|
|
|
|
cp = LXB_ENCODING_REPLACEMENT_CODEPOINT;
|
|
}
|
|
|
|
restore:
|
|
|
|
entry = lxb_unicode_normalization_entry_by_cp(cp);
|
|
|
|
if (!lxb_unicode_normalization_is_null(entry)) {
|
|
if (entry->quick & uc->quick_type) {
|
|
goto ok_true;
|
|
}
|
|
|
|
if (entry->ccc < uc->quick_ccc) {
|
|
goto ok_true;
|
|
}
|
|
|
|
uc->quick_ccc = entry->ccc;
|
|
}
|
|
else if (uc->quick_type & (LXB_UNICODE_QUICK_NFD_NO|LXB_UNICODE_QUICK_NFKD_NO)
|
|
&& cp >= lxb_unicode_sb && cp <= lxb_unicode_sl)
|
|
{
|
|
goto ok_true;
|
|
}
|
|
}
|
|
|
|
if (is_last) {
|
|
uc->quick_ccc = 0;
|
|
}
|
|
|
|
return false;
|
|
|
|
ok_true:
|
|
|
|
uc->quick_ccc = 0;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
lxb_unicode_quick_check_end(lxb_unicode_normalizer_t *uc)
|
|
{
|
|
return lxb_unicode_quick_check(uc, NULL, 0, true);
|
|
}
|
|
|
|
bool
|
|
lxb_unicode_quick_check_cp(lxb_unicode_normalizer_t *uc,
|
|
const lxb_codepoint_t *cps, size_t length,
|
|
bool is_last)
|
|
{
|
|
lxb_codepoint_t cp;
|
|
const lxb_codepoint_t *end;
|
|
const lxb_unicode_normalization_entry_t *entry;
|
|
|
|
end = cps + length;
|
|
|
|
while (cps < end) {
|
|
cp = *cps++;
|
|
|
|
entry = lxb_unicode_normalization_entry_by_cp(cp);
|
|
|
|
if (!lxb_unicode_normalization_is_null(entry)) {
|
|
if (entry->quick & uc->quick_type) {
|
|
goto ok_true;
|
|
}
|
|
|
|
if (entry->ccc < uc->quick_ccc) {
|
|
goto ok_true;
|
|
}
|
|
|
|
uc->quick_ccc = entry->ccc;
|
|
}
|
|
else if (uc->quick_type & (LXB_UNICODE_QUICK_NFD_NO|LXB_UNICODE_QUICK_NFKD_NO)
|
|
&& cp >= lxb_unicode_sb && cp <= lxb_unicode_sl)
|
|
{
|
|
goto ok_true;
|
|
}
|
|
}
|
|
|
|
if (is_last) {
|
|
uc->quick_ccc = 0;
|
|
}
|
|
|
|
return false;
|
|
|
|
ok_true:
|
|
|
|
uc->quick_ccc = 0;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
lxb_unicode_quick_check_cp_end(lxb_unicode_normalizer_t *uc)
|
|
{
|
|
return lxb_unicode_quick_check_cp(uc, NULL, 0, true);
|
|
}
|
|
|
|
static void
|
|
lxb_unicode_canonical(lxb_unicode_buffer_t *starter, lxb_unicode_buffer_t *op,
|
|
lxb_unicode_buffer_t *p)
|
|
{
|
|
lxb_unicode_reorder(op, starter);
|
|
lxb_unicode_canonical_composition(starter, p);
|
|
}
|
|
|
|
static void
|
|
lxb_unicode_compatibility(lxb_unicode_buffer_t *starter,
|
|
lxb_unicode_buffer_t *op, lxb_unicode_buffer_t *p)
|
|
{
|
|
(void) p;
|
|
lxb_unicode_reorder(op, starter);
|
|
}
|
|
|
|
static void
|
|
lxb_unicode_canonical_composition(lxb_unicode_buffer_t *p,
|
|
const lxb_unicode_buffer_t *end)
|
|
{
|
|
lxb_codepoint_t cp;
|
|
lxb_unicode_buffer_t *starter;
|
|
const lxb_unicode_normalization_entry_t *entry;
|
|
const lxb_unicode_composition_cp_t *centry;
|
|
|
|
/* p is a starter. */
|
|
|
|
starter = p++;
|
|
|
|
while (p < end) {
|
|
if (p->cp == LXB_ENCODING_ERROR_CODEPOINT) {
|
|
p += 1;
|
|
continue;
|
|
}
|
|
|
|
if (p[-1].ccc != 0 && p[-1].ccc >= p->ccc) {
|
|
p += 1;
|
|
continue;
|
|
}
|
|
|
|
centry = lxb_unicode_compose_entry(starter->cp, p->cp);
|
|
|
|
if (centry != NULL) {
|
|
if (!centry->exclusion) {
|
|
entry = lxb_unicode_normalization_entry_by_cp(centry->cp);
|
|
|
|
starter->cp = centry->cp;
|
|
starter->ccc = entry->ccc;
|
|
|
|
p->cp = LXB_ENCODING_ERROR_CODEPOINT;
|
|
p->ccc = 0;
|
|
}
|
|
}
|
|
else {
|
|
cp = lxb_unicode_entry_compose_hangul(starter->cp, p->cp);
|
|
|
|
if (cp != LXB_ENCODING_ERROR_CODEPOINT) {
|
|
starter->cp = cp;
|
|
starter->ccc = 0;
|
|
|
|
p->cp = LXB_ENCODING_ERROR_CODEPOINT;
|
|
p->ccc = 0;
|
|
}
|
|
}
|
|
|
|
p += 1;
|
|
}
|
|
}
|
|
|
|
static lxb_unicode_buffer_t *
|
|
lxb_unicode_canonical_decomposition(lxb_unicode_normalizer_t *uc,
|
|
lxb_codepoint_t cp,
|
|
lxb_unicode_buffer_t **buf,
|
|
const lxb_unicode_buffer_t **end)
|
|
{
|
|
return lxb_unicode_decomposition(uc,cp, buf, end,
|
|
LXB_UNICODE_DECOMPOSITION_TYPE__UNDEF);
|
|
}
|
|
|
|
static lxb_unicode_buffer_t *
|
|
lxb_unicode_compatibility_decomposition(lxb_unicode_normalizer_t *uc,
|
|
lxb_codepoint_t cp,
|
|
lxb_unicode_buffer_t **buf,
|
|
const lxb_unicode_buffer_t **end)
|
|
{
|
|
return lxb_unicode_decomposition(uc,cp, buf, end,
|
|
LXB_UNICODE_DECOMPOSITION_TYPE__LAST_ENTRY);
|
|
}
|
|
|
|
static lxb_unicode_buffer_t *
|
|
lxb_unicode_decomposition(lxb_unicode_normalizer_t *uc, lxb_codepoint_t cp,
|
|
lxb_unicode_buffer_t **buf,
|
|
const lxb_unicode_buffer_t **end,
|
|
lxb_unicode_decomposition_type_t type)
|
|
{
|
|
size_t i, length;
|
|
lxb_unicode_buffer_t *p;
|
|
const lxb_codepoint_t *mapping;
|
|
const lxb_unicode_normalization_entry_t *entry;
|
|
|
|
entry = lxb_unicode_normalization_entry_by_cp(cp);
|
|
|
|
if (!lxb_unicode_normalization_is_null(entry) && entry->length > 0
|
|
&& LXB_UNICODE_DECOMPOSITION_TYPE(entry->type) <= type)
|
|
{
|
|
if (type == LXB_UNICODE_DECOMPOSITION_TYPE__UNDEF) {
|
|
mapping = lxb_unicode_full_canonical(entry, &length);
|
|
}
|
|
else {
|
|
mapping = lxb_unicode_full_compatibility(entry, &length);
|
|
}
|
|
|
|
lxb_unicode_check_buf(uc, buf, end, length);
|
|
if (*buf == NULL) {
|
|
return NULL;
|
|
}
|
|
|
|
p = *buf;
|
|
|
|
for (i = 0; i < length; i++) {
|
|
entry = lxb_unicode_normalization_entry_by_cp(mapping[i]);
|
|
|
|
p->cp = mapping[i];
|
|
p->ccc = entry->ccc;
|
|
|
|
p += 1;
|
|
}
|
|
}
|
|
else if (cp >= lxb_unicode_sb && cp <= lxb_unicode_sl) {
|
|
return lxb_unicode_entry_decomposition_hangul(uc, buf, end, cp);
|
|
}
|
|
else {
|
|
lxb_unicode_check_buf(uc, buf, end, 1);
|
|
if (*buf == NULL) {
|
|
return NULL;
|
|
}
|
|
|
|
p = *buf;
|
|
|
|
p->cp = cp;
|
|
p->ccc = entry->ccc;
|
|
|
|
p += 1;
|
|
}
|
|
|
|
return p;
|
|
}
|
|
|
|
static lxb_unicode_buffer_t *
|
|
lxb_unicode_entry_decomposition_hangul(lxb_unicode_normalizer_t *uc,
|
|
lxb_unicode_buffer_t **buf,
|
|
const lxb_unicode_buffer_t **end,
|
|
lxb_codepoint_t cp)
|
|
{
|
|
lxb_unicode_buffer_t *p;
|
|
lxb_codepoint_t sid = cp - lxb_unicode_sb;
|
|
lxb_codepoint_t tid = sid % lxb_unicode_tc;
|
|
lxb_codepoint_t x = floorf((sid - tid) / lxb_unicode_tc);
|
|
|
|
lxb_unicode_check_buf(uc, buf, end, 2 + (tid != 0));
|
|
if (*buf == NULL) {
|
|
return NULL;
|
|
}
|
|
|
|
p = *buf;
|
|
|
|
p->cp = lxb_unicode_lb + floorf(x / lxb_unicode_vc);
|
|
p->ccc = 0;
|
|
p += 1;
|
|
|
|
p->cp = lxb_unicode_vb + (x % lxb_unicode_vc);
|
|
p->ccc = 0;
|
|
p += 1;
|
|
|
|
if (tid != 0) {
|
|
p->cp = lxb_unicode_ts + tid;
|
|
p->ccc = 0;
|
|
p += 1;
|
|
}
|
|
|
|
return p;
|
|
}
|
|
|
|
static lxb_codepoint_t
|
|
lxb_unicode_entry_compose_hangul(lxb_codepoint_t first, lxb_codepoint_t second)
|
|
{
|
|
|
|
if (first >= lxb_unicode_lb && first <= lxb_unicode_ll
|
|
&& second >= lxb_unicode_vb && second <= lxb_unicode_vl)
|
|
{
|
|
return lxb_unicode_sb
|
|
+ (((first - lxb_unicode_lb)
|
|
* lxb_unicode_vc) + second - lxb_unicode_vb)
|
|
* lxb_unicode_tc;
|
|
}
|
|
|
|
if (first >= lxb_unicode_sb && first <= lxb_unicode_sl
|
|
&& (first - lxb_unicode_sb) % lxb_unicode_tc == 0
|
|
&& second >= lxb_unicode_tb && second <= lxb_unicode_tl)
|
|
{
|
|
return first + second - lxb_unicode_ts;
|
|
}
|
|
|
|
return LXB_ENCODING_ERROR_CODEPOINT;
|
|
}
|
|
|
|
const lxb_unicode_composition_cp_t *
|
|
lxb_unicode_compose_entry(lxb_codepoint_t first, lxb_codepoint_t second)
|
|
{
|
|
return lxb_unicode_composition_cp(first, second);
|
|
}
|
|
|
|
lxb_unicode_idna_type_t
|
|
lxb_unicode_idna_type(lxb_codepoint_t cp)
|
|
{
|
|
const lxb_unicode_idna_entry_t *idna;
|
|
const lxb_unicode_entry_t *entry = lxb_unicode_entry(cp);
|
|
|
|
if (entry->idna == 0) {
|
|
return LXB_UNICODE_IDNA_DISALLOWED;
|
|
}
|
|
|
|
idna = lxb_unicode_idna_entry(entry);
|
|
|
|
return idna->type;
|
|
}
|
|
|
|
/*
|
|
* This function generated by the script "utils/lexbor/unicode/build.pl"!
|
|
* Run script and copy output.
|
|
*/
|
|
const lxb_unicode_entry_t *
|
|
lxb_unicode_entry(lxb_codepoint_t cp)
|
|
{
|
|
if (cp > 0x10FFFE) {
|
|
return &lxb_unicode_entries[0];
|
|
}
|
|
|
|
if (cp < 205744) {
|
|
if (cp < 83527) {
|
|
if (cp < 57345) {
|
|
return &lxb_unicode_entries[lxb_unicode_table_map_0_57345[cp - 0]];
|
|
}
|
|
else if (cp >= 63743) {
|
|
return &lxb_unicode_entries[lxb_unicode_table_map_63743_83527[cp - 63743]];
|
|
}
|
|
}
|
|
else if (cp >= 90368) {
|
|
if (cp < 101641) {
|
|
return &lxb_unicode_entries[lxb_unicode_table_map_90368_101641[cp - 90368]];
|
|
}
|
|
else if (cp >= 110576) {
|
|
return &lxb_unicode_entries[lxb_unicode_table_map_110576_205744[cp - 110576]];
|
|
}
|
|
}
|
|
}
|
|
else if (cp >= 917505) {
|
|
if (cp < 983041) {
|
|
if (cp < 918000) {
|
|
return &lxb_unicode_entries[lxb_unicode_table_map_917505_918000[cp - 917505]];
|
|
}
|
|
else if (cp >= 983040) {
|
|
return &lxb_unicode_entries[lxb_unicode_table_map_983040_983041[cp - 983040]];
|
|
}
|
|
}
|
|
else if (cp >= 1048573) {
|
|
if (cp < 1048577) {
|
|
return &lxb_unicode_entries[lxb_unicode_table_map_1048573_1048577[cp - 1048573]];
|
|
}
|
|
else if (cp >= 1114109) {
|
|
if (cp < 1114110) {
|
|
return &lxb_unicode_entries[lxb_unicode_table_map_1114109_1114110[cp - 1114109]];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return &lxb_unicode_entries[0];
|
|
}
|
|
|
|
const lxb_unicode_composition_cp_t *
|
|
lxb_unicode_composition_cp(lxb_codepoint_t first, lxb_codepoint_t second)
|
|
{
|
|
const lxb_unicode_normalization_entry_t *norm;
|
|
const lxb_unicode_composition_entry_t *comp;
|
|
const lxb_unicode_composition_cp_t *cps;
|
|
|
|
norm = lxb_unicode_normalization_entry(lxb_unicode_entry(first));
|
|
comp = &lxb_unicode_composition_entries[norm->composition];
|
|
|
|
if (second >= comp->cp && second < comp->cp + comp->length) {
|
|
cps = &lxb_unicode_composition_cps[comp->index + second - comp->cp];
|
|
|
|
return (cps->cp != 0x00) ? cps : NULL;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
const lxb_unicode_normalization_entry_t *
|
|
lxb_unicode_normalization_entry(const lxb_unicode_entry_t *entry)
|
|
{
|
|
return &lxb_unicode_normalization_entries[entry->normalization];
|
|
}
|
|
|
|
const lxb_unicode_normalization_entry_t *
|
|
lxb_unicode_normalization_entry_by_cp(lxb_codepoint_t cp)
|
|
{
|
|
const lxb_unicode_entry_t *entry = lxb_unicode_entry(cp);
|
|
return &lxb_unicode_normalization_entries[entry->normalization];
|
|
}
|
|
|
|
const lxb_unicode_normalization_entry_t *
|
|
lxb_unicode_normalization_entry_by_index(uint16_t index)
|
|
{
|
|
return &lxb_unicode_normalization_entries[index];
|
|
}
|
|
|
|
bool
|
|
lxb_unicode_normalization_is_null(const lxb_unicode_normalization_entry_t *entry)
|
|
{
|
|
return entry == &lxb_unicode_normalization_entries[0];
|
|
}
|
|
|
|
const lxb_codepoint_t *
|
|
lxb_unicode_full_canonical(const lxb_unicode_normalization_entry_t *entry,
|
|
size_t *out_length)
|
|
{
|
|
const uint32_t *can;
|
|
|
|
if (LXB_UNICODE_IS_CANONICAL_SEPARATELY(entry->type)) {
|
|
can = &lxb_unicode_decomposition_cps[entry->decomposition] + entry->length;
|
|
|
|
*out_length = (size_t) *can;
|
|
|
|
return can + 1;
|
|
}
|
|
|
|
return lxb_unicode_full_compatibility(entry, out_length);
|
|
}
|
|
|
|
const lxb_codepoint_t *
|
|
lxb_unicode_full_compatibility(const lxb_unicode_normalization_entry_t *entry,
|
|
size_t *out_length)
|
|
{
|
|
if (entry->length > 0) {
|
|
*out_length = (size_t) entry->length;
|
|
|
|
return &lxb_unicode_decomposition_cps[entry->decomposition];
|
|
}
|
|
|
|
*out_length = 0;
|
|
|
|
return NULL;
|
|
}
|
|
|
|
const lxb_unicode_idna_entry_t *
|
|
lxb_unicode_idna_entry(const lxb_unicode_entry_t *entry)
|
|
{
|
|
return &lxb_unicode_idna_entries[entry->idna];
|
|
}
|
|
|
|
const lxb_unicode_idna_entry_t *
|
|
lxb_unicode_idna_entry_by_cp(lxb_codepoint_t cp)
|
|
{
|
|
return &lxb_unicode_idna_entries[lxb_unicode_entry(cp)->idna];
|
|
}
|
|
|
|
const lxb_unicode_idna_entry_t *
|
|
lxb_unicode_idna_entry_by_index(uint16_t index)
|
|
{
|
|
return &lxb_unicode_idna_entries[index];
|
|
}
|
|
|
|
const lxb_codepoint_t *
|
|
lxb_unicode_idna_map(const lxb_unicode_idna_entry_t *entry,
|
|
size_t *out_length)
|
|
{
|
|
if (entry->length > 0) {
|
|
*out_length = (size_t) entry->length;
|
|
|
|
return &lxb_unicode_idna_cps[entry->index];
|
|
}
|
|
|
|
*out_length = 0;
|
|
|
|
return NULL;
|
|
}
|