Browse Source
Add Lexbor files for URL handling (#18656)
Add Lexbor files for URL handling (#18656)
Relates to #14461 and https://wiki.php.net/rfc/url_parsing_api
pull/18436/merge
committed by
GitHub
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 210811 additions and 1 deletions
- 3 codecov.yml
- 8 ext/lexbor/config.m4
- 5 ext/lexbor/config.w32
- 30 ext/lexbor/lexbor/punycode/base.h
- 671 ext/lexbor/lexbor/punycode/punycode.c
- 109 ext/lexbor/lexbor/punycode/punycode.h
- 157 ext/lexbor/lexbor/unicode/base.h
- 738 ext/lexbor/lexbor/unicode/idna.c
- 264 ext/lexbor/lexbor/unicode/idna.h
- 201955 ext/lexbor/lexbor/unicode/res.h
- 1039 ext/lexbor/lexbor/unicode/unicode.c
- 405 ext/lexbor/lexbor/unicode/unicode.h
- 32 ext/lexbor/lexbor/url/base.h
- 4845 ext/lexbor/lexbor/url/url.c
- 551 ext/lexbor/lexbor/url/url.h
@ -0,0 +1,30 @@ |
|||
/* |
|||
* Copyright (C) 2023-2024 Alexander Borisov |
|||
* |
|||
* Author: Alexander Borisov <borisov@lexbor.com> |
|||
*/ |
|||
|
|||
#ifndef LEXBOR_PUNYCODE_BASE_H
#define LEXBOR_PUNYCODE_BASE_H

#ifdef __cplusplus
extern "C" {
#endif

#include "lexbor/core/base.h"


/* Numeric components of the Punycode module version. */
#define LXB_PUNYCODE_VERSION_MAJOR 1
#define LXB_PUNYCODE_VERSION_MINOR 1
#define LXB_PUNYCODE_VERSION_PATCH 0

/*
 * The three components joined with "." into one string literal.
 * LEXBOR_STRINGIZE is not defined in this header; it is expected to come
 * from the include above.
 */
#define LEXBOR_PUNYCODE_VERSION_STRING                                         \
    LEXBOR_STRINGIZE(LXB_PUNYCODE_VERSION_MAJOR) "."                           \
    LEXBOR_STRINGIZE(LXB_PUNYCODE_VERSION_MINOR) "."                           \
    LEXBOR_STRINGIZE(LXB_PUNYCODE_VERSION_PATCH)


#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* LEXBOR_PUNYCODE_BASE_H */
@ -0,0 +1,671 @@ |
|||
/* |
|||
* Copyright (C) 2023-2024 Alexander Borisov |
|||
* |
|||
* Author: Alexander Borisov <borisov@lexbor.com> |
|||
*/ |
|||
|
|||
#include "lexbor/punycode/punycode.h" |
|||
#include "lexbor/encoding/encoding.h" |
|||
|
|||
|
|||
/*
 * Numeric parameters of the encoder/decoder below. The values are the
 * Bootstring parameters that RFC 3492 (section 5) assigns to Punycode.
 */
enum {
    LXB_PUNYCODE_BASE         = 36,   /* Number of distinct digits. */
    LXB_PUNYCODE_TMIN         = 1,    /* Lower clamp for the threshold t. */
    LXB_PUNYCODE_TMAX         = 26,   /* Upper clamp for the threshold t. */
    LXB_PUNYCODE_SKEW         = 38,   /* Used by lxb_punycode_adapt(). */
    LXB_PUNYCODE_DAMP         = 700,  /* Divisor for the first adaptation. */
    LXB_PUNYCODE_INITIAL_BIAS = 72,   /* Bias before any adaptation. */
    LXB_PUNYCODE_INITIAL_N    = 0x80, /* First non-basic code point. */
    LXB_PUNYCODE_DELIMITER    = 0x2D  /* '-' between basic part and digits. */
};
|||
|
|||
|
|||
static lxb_status_t |
|||
lxb_punycode_callback_cp(const lxb_codepoint_t *cps, size_t len, void *ctx); |
|||
|
|||
|
|||
/*
 * Double the capacity of the encoder's output buffer.
 *
 * @param[in]      p       Current write position. Not read here; the returned
 *                         position is always the old capacity, so callers
 *                         invoke this only when the buffer is exactly full.
 * @param[in, out] buf     Start of the output buffer; updated on success.
 * @param[in, out] end     One past the end of the buffer; updated on success.
 * @param[in]      buffer  The caller's fixed array. When *buf still equals it,
 *                         a fresh heap block is allocated and the old bytes
 *                         are copied; otherwise the heap block is resized.
 *
 * @return New write position on success. On failure NULL for the malloc path,
 * or the result of lexbor_free(*buf) for the realloc path (callers test the
 * result against NULL).
 */
lxb_inline lxb_char_t *
lxb_punycode_encode_realloc(lxb_char_t *p, lxb_char_t **buf,
                            const lxb_char_t **end, const lxb_char_t *buffer)
{
    lxb_char_t *grown;
    size_t old_size = *end - *buf;
    size_t new_size = old_size * 2;

    if (*buf != buffer) {
        grown = lexbor_realloc(*buf, new_size);
        if (grown == NULL) {
            return lexbor_free(*buf);
        }
    }
    else {
        grown = lexbor_malloc(new_size);
        if (grown == NULL) {
            return NULL;
        }

        memcpy(grown, *buf, old_size);
    }

    *end = grown + new_size;
    *buf = grown;

    return grown + old_size;
}
|||
|
|||
/*
 * Double the capacity (counted in code points) of the decoder's output buffer.
 *
 * @param[in]      p       Current write position. Not read here; the returned
 *                         position is always the old capacity.
 * @param[in, out] buf     Start of the output buffer; updated on success.
 * @param[in, out] end     One past the end of the buffer; updated on success.
 * @param[in]      buffer  The caller's fixed array. When *buf still equals it,
 *                         a heap block is allocated and the old content is
 *                         copied; otherwise the heap block is resized.
 *
 * @return New write position on success. On failure NULL for the malloc path,
 * or the result of lexbor_free(*buf) for the realloc path.
 */
lxb_inline lxb_codepoint_t *
lxb_punycode_decode_realloc(lxb_codepoint_t *p, lxb_codepoint_t **buf,
                            const lxb_codepoint_t **end,
                            const lxb_codepoint_t *buffer)
{
    lxb_codepoint_t *grown;
    size_t old_count = *end - *buf;
    size_t new_count = old_count * 2;

    if (*buf != buffer) {
        grown = lexbor_realloc(*buf, new_count * sizeof(lxb_codepoint_t));
        if (grown == NULL) {
            return lexbor_free(*buf);
        }
    }
    else {
        grown = lexbor_malloc(new_count * sizeof(lxb_codepoint_t));
        if (grown == NULL) {
            return NULL;
        }

        memcpy(grown, *buf, old_count * sizeof(lxb_codepoint_t));
    }

    *end = grown + new_count;
    *buf = grown;

    return grown + old_count;
}
|||
|
|||
/*
 * Map a digit value to its character: values below 26 are offset by 97
 * ('a'..'z' in ASCII), the rest are offset by 22 (26..35 give '0'..'9').
 */
static char
lxb_punycode_encode_digit(size_t d)
{
    if (d < 26) {
        return (char) (d + 97);
    }

    return (char) (d + 22);
}
|||
|
|||
static size_t |
|||
lxb_punycode_decode_digit(lxb_codepoint_t cp) |
|||
{ |
|||
return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 |
|||
: cp - 97 < 26 ? cp - 97 : LXB_PUNYCODE_BASE; |
|||
} |
|||
|
|||
static size_t |
|||
lxb_punycode_adapt(size_t delta, size_t numpoints, bool firsttime) |
|||
{ |
|||
size_t k; |
|||
|
|||
delta = firsttime ? delta / LXB_PUNYCODE_DAMP : delta >> 1; |
|||
delta += delta / numpoints; |
|||
|
|||
for (k = 0; |
|||
delta > ((LXB_PUNYCODE_BASE - LXB_PUNYCODE_TMIN) * LXB_PUNYCODE_TMAX) / 2; |
|||
k += LXB_PUNYCODE_BASE) |
|||
{ |
|||
delta /= LXB_PUNYCODE_BASE - LXB_PUNYCODE_TMIN; |
|||
} |
|||
|
|||
return k + (LXB_PUNYCODE_BASE - LXB_PUNYCODE_TMIN + 1) |
|||
* delta / (delta + LXB_PUNYCODE_SKEW); |
|||
} |
|||
|
|||
static lxb_status_t |
|||
lxb_punycode_encode_body(const lxb_codepoint_t *cps, const lxb_codepoint_t *cps_end, |
|||
lxb_char_t *p, lxb_char_t *buf, const lxb_char_t *end, |
|||
const lxb_char_t *buffer, lxb_punycode_encode_cb_f cb, |
|||
void *ctx) |
|||
{ |
|||
bool unchanged; |
|||
size_t h, b, n, q, k, t, delta, bias; |
|||
lxb_status_t status; |
|||
lxb_codepoint_t cp, m; |
|||
const lxb_codepoint_t *cps_t, *cps_p; |
|||
|
|||
n = LXB_PUNYCODE_INITIAL_N; |
|||
bias = LXB_PUNYCODE_INITIAL_BIAS; |
|||
delta = 0; |
|||
b = p - buf; |
|||
cps_p = cps + b; |
|||
|
|||
if (cps_p >= cps_end) { |
|||
unchanged = true; |
|||
goto done; |
|||
} |
|||
|
|||
if (p > buf) { |
|||
*p++ = LXB_PUNYCODE_DELIMITER; |
|||
} |
|||
|
|||
unchanged = false; |
|||
|
|||
while (cps_p < cps_end) { |
|||
m = UINT32_MAX; |
|||
cps_t = cps; |
|||
|
|||
while (cps_t < cps_end) { |
|||
cp = *cps_t++; |
|||
|
|||
if (cp >= n && cp < m) { |
|||
m = cp; |
|||
} |
|||
} |
|||
|
|||
h = (cps_p - cps) + 1; |
|||
|
|||
if (m - n > (UINT32_MAX - delta) / h) { |
|||
status = LXB_STATUS_ERROR_OVERFLOW; |
|||
goto failed; |
|||
} |
|||
|
|||
delta += (m - n) * h; |
|||
n = m; |
|||
|
|||
cps_t = cps; |
|||
|
|||
while (cps_t < cps_end) { |
|||
cp = *cps_t++; |
|||
|
|||
if (cp < n) { |
|||
if (++delta == 0) { |
|||
status = LXB_STATUS_ERROR_OVERFLOW; |
|||
goto failed; |
|||
} |
|||
} |
|||
|
|||
if (cp == n) { |
|||
q = delta; |
|||
k = LXB_PUNYCODE_BASE; |
|||
|
|||
for (;; k += LXB_PUNYCODE_BASE) { |
|||
t = k <= bias ? LXB_PUNYCODE_TMIN : |
|||
k >= bias + LXB_PUNYCODE_TMAX |
|||
? LXB_PUNYCODE_TMAX : k - bias; |
|||
|
|||
if (q < t) { |
|||
break; |
|||
} |
|||
|
|||
if (p >= end) { |
|||
p = lxb_punycode_encode_realloc(p, &buf, &end, buffer); |
|||
if (p == NULL) { |
|||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION; |
|||
} |
|||
} |
|||
|
|||
*p++ = lxb_punycode_encode_digit(t + (q - t) |
|||
% (LXB_PUNYCODE_BASE - t)); |
|||
q = (q - t) / (LXB_PUNYCODE_BASE - t); |
|||
} |
|||
|
|||
h = cps_p - cps; |
|||
|
|||
if (p >= end) { |
|||
p = lxb_punycode_encode_realloc(p, &buf, &end, buffer); |
|||
if (p == NULL) { |
|||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION; |
|||
} |
|||
} |
|||
|
|||
*p++ = lxb_punycode_encode_digit(q); |
|||
bias = lxb_punycode_adapt(delta, h + 1, h == b); |
|||
delta = 0; |
|||
cps_p += 1; |
|||
} |
|||
} |
|||
|
|||
delta += 1; |
|||
n += 1; |
|||
} |
|||
|
|||
done: |
|||
|
|||
status = cb(buf, p - buf, ctx, unchanged); |
|||
|
|||
failed: |
|||
|
|||
if (buf != buffer) { |
|||
(void) lexbor_free(buf); |
|||
} |
|||
|
|||
return status; |
|||
} |
|||
|
|||
lxb_status_t |
|||
lxb_punycode_encode(const lxb_char_t *data, size_t length, |
|||
lxb_punycode_encode_cb_f cb, void *ctx) |
|||
{ |
|||
size_t cp_length; |
|||
uint8_t len; |
|||
lxb_char_t *p, *buf; |
|||
lxb_status_t status; |
|||
lxb_codepoint_t cp, *cps, *cps_p; |
|||
const lxb_char_t *data_p, *data_end, *end; |
|||
const lxb_codepoint_t *cps_end; |
|||
lxb_char_t buffer[4096]; |
|||
lxb_codepoint_t input[4096]; |
|||
|
|||
/* |
|||
* Make GCC happy. |
|||
* length variable can be 0. |
|||
*/ |
|||
input[0] = 0x00; |
|||
|
|||
p = buffer; |
|||
buf = buffer; |
|||
end = buffer + sizeof(buffer); |
|||
|
|||
data_p = data; |
|||
data_end = data + length; |
|||
cp_length = 0; |
|||
|
|||
while (data_p < data_end) { |
|||
len = lxb_encoding_decode_utf_8_length(*data_p); |
|||
if (len == 0) { |
|||
return LXB_STATUS_ERROR_UNEXPECTED_DATA; |
|||
} |
|||
|
|||
data_p += len; |
|||
cp_length += 1; |
|||
} |
|||
|
|||
if (cp_length <= sizeof(input) / sizeof(lxb_codepoint_t)) { |
|||
cps = input; |
|||
} |
|||
else { |
|||
cps = lexbor_malloc(cp_length * sizeof(lxb_codepoint_t)); |
|||
if (cps == NULL) { |
|||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION; |
|||
} |
|||
} |
|||
|
|||
data_p = data; |
|||
|
|||
cps_p = cps; |
|||
cps_end = cps + cp_length; |
|||
|
|||
while (data_p < data_end) { |
|||
cp = lxb_encoding_decode_valid_utf_8_single(&data_p, data_end); |
|||
if (cp == LXB_ENCODING_DECODE_ERROR) { |
|||
status = LXB_STATUS_ERROR_UNEXPECTED_DATA; |
|||
goto done; |
|||
} |
|||
|
|||
*cps_p++ = cp; |
|||
|
|||
if (cp < 0x80) { |
|||
if (p >= end) { |
|||
p = lxb_punycode_encode_realloc(p, &buf, &end, buffer); |
|||
if (p == NULL) { |
|||
status = LXB_STATUS_ERROR_MEMORY_ALLOCATION; |
|||
goto done; |
|||
} |
|||
} |
|||
|
|||
*p++ = cp; |
|||
} |
|||
} |
|||
|
|||
status = lxb_punycode_encode_body(cps, cps_end, p, buf, end, buffer, |
|||
cb, ctx); |
|||
done: |
|||
|
|||
if (cps != input) { |
|||
(void) lexbor_free(cps); |
|||
} |
|||
|
|||
return status; |
|||
} |
|||
|
|||
lxb_status_t |
|||
lxb_punycode_encode_cp(const lxb_codepoint_t *cps, size_t length, |
|||
lxb_punycode_encode_cb_f cb, void *ctx) |
|||
{ |
|||
lxb_char_t *p, *buf; |
|||
lxb_codepoint_t cp; |
|||
const lxb_char_t *end; |
|||
const lxb_codepoint_t *cps_p, *cps_end; |
|||
lxb_char_t buffer[4096]; |
|||
|
|||
p = buffer; |
|||
buf = buffer; |
|||
end = buffer + sizeof(buffer); |
|||
|
|||
cps_p = cps; |
|||
cps_end = cps + length; |
|||
|
|||
while (cps_p < cps_end) { |
|||
cp = *cps_p++; |
|||
|
|||
if (cp < 0x80) { |
|||
if (p >= end) { |
|||
p = lxb_punycode_encode_realloc(p, &buf, &end, buffer); |
|||
if (p == NULL) { |
|||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION; |
|||
} |
|||
} |
|||
|
|||
*p++ = cp; |
|||
} |
|||
} |
|||
|
|||
return lxb_punycode_encode_body(cps, cps_end, p, buf, end, buffer, cb, ctx); |
|||
} |
|||
|
|||
lxb_status_t |
|||
lxb_punycode_decode(const lxb_char_t *data, size_t length, |
|||
lexbor_serialize_cb_f cb, void *ctx) |
|||
{ |
|||
lexbor_serialize_ctx_t nctx = {.cb = cb, .ctx = ctx}; |
|||
|
|||
return lxb_punycode_decode_cb_cp(data, length, lxb_punycode_callback_cp, |
|||
&nctx); |
|||
} |
|||
|
|||
static lxb_status_t |
|||
lxb_punycode_callback_cp(const lxb_codepoint_t *cps, size_t len, void *ctx) |
|||
{ |
|||
uint8_t i; |
|||
size_t length; |
|||
lxb_status_t status; |
|||
const lxb_codepoint_t *cps_p, *cps_end; |
|||
lexbor_serialize_ctx_t *nctx = ctx; |
|||
lxb_char_t *p, *buf, *end; |
|||
lxb_char_t buffer[4096]; |
|||
|
|||
/* |
|||
* Make GCC happy. |
|||
* len variable can be 0. |
|||
*/ |
|||
buffer[0] = 0x00; |
|||
|
|||
cps_p = cps; |
|||
cps_end = cps_p + len; |
|||
length = 0; |
|||
|
|||
while (cps_p < cps_end) { |
|||
i = lxb_encoding_encode_utf_8_length(*cps_p++); |
|||
if (i == 0) { |
|||
return LXB_STATUS_ERROR_UNEXPECTED_DATA; |
|||
} |
|||
|
|||
length += i; |
|||
} |
|||
|
|||
buf = buffer; |
|||
end = buffer + sizeof(buffer); |
|||
|
|||
if (buf + length > end) { |
|||
buf = lexbor_malloc(length); |
|||
if (buf == NULL) { |
|||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION; |
|||
} |
|||
|
|||
end = buf + length; |
|||
} |
|||
|
|||
p = buf; |
|||
cps_p = cps; |
|||
|
|||
while (cps_p < cps_end) { |
|||
(void) lxb_encoding_encode_utf_8_single(NULL, &p, end, *cps_p++); |
|||
} |
|||
|
|||
status = nctx->cb(buf, p - buf, nctx->ctx); |
|||
|
|||
if (buf != buffer) { |
|||
(void) lexbor_free(buf); |
|||
} |
|||
|
|||
return status; |
|||
} |
|||
|
|||
lxb_status_t |
|||
lxb_punycode_decode_cp(const lxb_codepoint_t *data, size_t length, |
|||
lexbor_serialize_cb_cp_f cb, void *ctx) |
|||
{ |
|||
size_t buf_len, digit, oldi, bias, w, k, t, i, h, in; |
|||
const lxb_codepoint_t *delimiter, *data_p, *data_end; |
|||
lxb_status_t status; |
|||
lxb_codepoint_t cp, n; |
|||
lxb_codepoint_t *p, *buf; |
|||
const lxb_codepoint_t *end; |
|||
lxb_codepoint_t buffer[4096]; |
|||
|
|||
p = buffer; |
|||
buf = buffer; |
|||
buf_len = sizeof(buffer) / sizeof(lxb_codepoint_t); |
|||
end = buffer + buf_len; |
|||
|
|||
data_p = data; |
|||
data_end = data + length; |
|||
delimiter = data_end; |
|||
|
|||
while (delimiter != data) { |
|||
delimiter -= 1; |
|||
|
|||
if (*delimiter == LXB_PUNYCODE_DELIMITER) { |
|||
break; |
|||
} |
|||
} |
|||
|
|||
while (data_p < delimiter) { |
|||
cp = *data_p++; |
|||
|
|||
if (cp >= 0x80) { |
|||
status = LXB_STATUS_ERROR_UNEXPECTED_DATA; |
|||
goto done; |
|||
} |
|||
|
|||
if (p >= end) { |
|||
p = lxb_punycode_decode_realloc(p, &buf, &end, buffer); |
|||
if (p == NULL) { |
|||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION; |
|||
} |
|||
} |
|||
|
|||
*p++ = cp; |
|||
} |
|||
|
|||
i = 0; |
|||
n = LXB_PUNYCODE_INITIAL_N; |
|||
bias = LXB_PUNYCODE_INITIAL_BIAS; |
|||
data_p = (delimiter != data) ? delimiter + 1: data; |
|||
in = data_p - data; |
|||
|
|||
for (; in < length; p++) { |
|||
for (oldi = i, w = 1, k = LXB_PUNYCODE_BASE; ; k += LXB_PUNYCODE_BASE) { |
|||
if (in >= length) { |
|||
status = LXB_STATUS_ERROR_UNEXPECTED_DATA; |
|||
goto done; |
|||
} |
|||
|
|||
cp = data[in++]; |
|||
digit = lxb_punycode_decode_digit(cp); |
|||
|
|||
if (digit >= LXB_PUNYCODE_BASE) { |
|||
status = LXB_STATUS_ERROR_UNEXPECTED_DATA; |
|||
goto done; |
|||
} |
|||
|
|||
if (digit > (UINT32_MAX - i) / w) { |
|||
status = LXB_STATUS_ERROR_OVERFLOW; |
|||
goto done; |
|||
} |
|||
|
|||
i += digit * w; |
|||
t = k <= bias ? LXB_PUNYCODE_TMIN |
|||
: k >= bias + LXB_PUNYCODE_TMAX ? LXB_PUNYCODE_TMAX : k - bias; |
|||
|
|||
if (digit < t) { |
|||
break; |
|||
} |
|||
|
|||
if (w > UINT32_MAX / (LXB_PUNYCODE_BASE - t)) { |
|||
status = LXB_STATUS_ERROR_OVERFLOW; |
|||
goto done; |
|||
} |
|||
|
|||
w *= (LXB_PUNYCODE_BASE - t); |
|||
} |
|||
|
|||
h = (p - buf) + 1; |
|||
|
|||
bias = lxb_punycode_adapt(i - oldi, h, oldi == 0); |
|||
|
|||
if (i / h > UINT32_MAX - n) { |
|||
status = LXB_STATUS_ERROR_OVERFLOW; |
|||
goto done; |
|||
} |
|||
|
|||
n += i / h; |
|||
i %= h; |
|||
|
|||
if (p >= end) { |
|||
p = lxb_punycode_decode_realloc(p, &buf, &end, buffer); |
|||
if (p == NULL) { |
|||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION; |
|||
} |
|||
} |
|||
|
|||
memmove(buf + i + 1, buf + i, ((h - 1) - i) * sizeof(lxb_codepoint_t)); |
|||
buf[i++] = n; |
|||
} |
|||
|
|||
status = cb(buf, p - buf, ctx); |
|||
|
|||
done: |
|||
|
|||
if (buffer != buf) { |
|||
(void) lexbor_free(buf); |
|||
} |
|||
|
|||
return status; |
|||
} |
|||
|
|||
lxb_status_t |
|||
lxb_punycode_decode_cb_cp(const lxb_char_t *data, size_t length, |
|||
lexbor_serialize_cb_cp_f cb, void *ctx) |
|||
{ |
|||
size_t buf_len, digit, oldi, bias, w, k, t, i, h, in; |
|||
const lxb_char_t *delimiter, *data_p, *data_end; |
|||
lxb_status_t status; |
|||
lxb_codepoint_t cp, n; |
|||
lxb_codepoint_t *p, *buf; |
|||
const lxb_codepoint_t *end; |
|||
lxb_codepoint_t buffer[4096]; |
|||
|
|||
p = buffer; |
|||
buf = buffer; |
|||
buf_len = sizeof(buffer) / sizeof(lxb_codepoint_t); |
|||
end = buffer + buf_len; |
|||
|
|||
data_p = data; |
|||
data_end = data + length; |
|||
delimiter = data_end; |
|||
|
|||
while (delimiter != data) { |
|||
delimiter -= 1; |
|||
|
|||
if (*delimiter == LXB_PUNYCODE_DELIMITER) { |
|||
break; |
|||
} |
|||
} |
|||
|
|||
while (data_p < delimiter) { |
|||
cp = *data_p++; |
|||
|
|||
if (cp >= 0x80) { |
|||
status = LXB_STATUS_ERROR_UNEXPECTED_DATA; |
|||
goto done; |
|||
} |
|||
|
|||
if (p >= end) { |
|||
p = lxb_punycode_decode_realloc(p, &buf, &end, buffer); |
|||
if (p == NULL) { |
|||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION; |
|||
} |
|||
} |
|||
|
|||
*p++ = cp; |
|||
} |
|||
|
|||
i = 0; |
|||
n = LXB_PUNYCODE_INITIAL_N; |
|||
bias = LXB_PUNYCODE_INITIAL_BIAS; |
|||
data_p = (delimiter != data) ? delimiter + 1: data; |
|||
in = data_p - data; |
|||
|
|||
for (; in < length; p++) { |
|||
for (oldi = i, w = 1, k = LXB_PUNYCODE_BASE; ; k += LXB_PUNYCODE_BASE) { |
|||
if (in >= length) { |
|||
status = LXB_STATUS_ERROR_UNEXPECTED_DATA; |
|||
goto done; |
|||
} |
|||
|
|||
cp = data[in++]; |
|||
digit = lxb_punycode_decode_digit(cp); |
|||
|
|||
if (digit >= LXB_PUNYCODE_BASE) { |
|||
status = LXB_STATUS_ERROR_UNEXPECTED_DATA; |
|||
goto done; |
|||
} |
|||
|
|||
if (digit > (UINT32_MAX - i) / w) { |
|||
status = LXB_STATUS_ERROR_OVERFLOW; |
|||
goto done; |
|||
} |
|||
|
|||
i += digit * w; |
|||
t = k <= bias ? LXB_PUNYCODE_TMIN |
|||
: k >= bias + LXB_PUNYCODE_TMAX ? LXB_PUNYCODE_TMAX : k - bias; |
|||
|
|||
if (digit < t) { |
|||
break; |
|||
} |
|||
|
|||
if (w > UINT32_MAX / (LXB_PUNYCODE_BASE - t)) { |
|||
status = LXB_STATUS_ERROR_OVERFLOW; |
|||
goto done; |
|||
} |
|||
|
|||
w *= (LXB_PUNYCODE_BASE - t); |
|||
} |
|||
|
|||
h = (p - buf) + 1; |
|||
|
|||
bias = lxb_punycode_adapt(i - oldi, h, oldi == 0); |
|||
|
|||
if (i / h > UINT32_MAX - n) { |
|||
status = LXB_STATUS_ERROR_OVERFLOW; |
|||
goto done; |
|||
} |
|||
|
|||
n += i / h; |
|||
i %= h; |
|||
|
|||
if (p >= end) { |
|||
p = lxb_punycode_decode_realloc(p, &buf, &end, buffer); |
|||
if (p == NULL) { |
|||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION; |
|||
} |
|||
} |
|||
|
|||
memmove(buf + i + 1, buf + i, ((h - 1) - i) * sizeof(lxb_codepoint_t)); |
|||
buf[i++] = n; |
|||
} |
|||
|
|||
status = cb(buf, p - buf, ctx); |
|||
|
|||
done: |
|||
|
|||
if (buffer != buf) { |
|||
(void) lexbor_free(buf); |
|||
} |
|||
|
|||
return status; |
|||
} |
@ -0,0 +1,109 @@ |
|||
/* |
|||
* Copyright (C) 2023 Alexander Borisov |
|||
* |
|||
* Author: Alexander Borisov <borisov@lexbor.com> |
|||
*/ |
|||
|
|||
#ifndef LEXBOR_PUNYCODE_H |
|||
#define LEXBOR_PUNYCODE_H |
|||
|
|||
#ifdef __cplusplus |
|||
extern "C" { |
|||
#endif |
|||
|
|||
#include "lexbor/punycode/base.h" |
|||
|
|||
|
|||
typedef lxb_status_t |
|||
(*lxb_punycode_encode_cb_f)(const lxb_char_t *data, size_t len, void *ctx, |
|||
bool unchanged); |
|||
|
|||
|
|||
/* |
|||
* Punycode: A Bootstring encoding of Unicode |
|||
* for Internationalized Domain Names in Applications (IDNA). |
|||
* |
|||
* https://www.rfc-editor.org/rfc/inline-errata/rfc3492.html |
|||
*/ |
|||
|
|||
/*
 * Encoding from characters to characters.
 *
 * @param[in] Input characters for encoding. Not NULL.
 * @param[in] Length of characters. Can be 0.
 * @param[in] Callback for results. Called only once when encoding is complete.
 * @param[in] Context for callback.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
|||
LXB_API lxb_status_t |
|||
lxb_punycode_encode(const lxb_char_t *data, size_t length, |
|||
lxb_punycode_encode_cb_f cb, void *ctx); |
|||
|
|||
/*
 * Encoding from code points to characters.
 *
 * Same as lxb_punycode_encode() only the input is code points.
 *
 * @param[in] Input code points for encoding. Not NULL.
 * @param[in] Length of code points. Can be 0.
 * @param[in] Callback for results. Called only once when encoding is complete.
 * @param[in] Context for callback.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
|||
LXB_API lxb_status_t |
|||
lxb_punycode_encode_cp(const lxb_codepoint_t *cps, size_t length, |
|||
lxb_punycode_encode_cb_f cb, void *ctx); |
|||
|
|||
/*
 * Decoding from characters to characters.
 *
 * @param[in] Input characters for decoding. Not NULL.
 * @param[in] Length of characters. Can be 0.
 * @param[in] Callback for results. Called only once when decoding is complete.
 * @param[in] Context for callback.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
|||
LXB_API lxb_status_t |
|||
lxb_punycode_decode(const lxb_char_t *data, size_t length, |
|||
lexbor_serialize_cb_f cb, void *ctx); |
|||
|
|||
/*
 * Decoding from code points to code points.
 *
 * Same as lxb_punycode_decode() only the input/output is code points.
 *
 * @param[in] Input code points for decoding. Not NULL.
 * @param[in] Length of code points. Can be 0.
 * @param[in] Callback for results. Called only once when decoding is complete.
 * @param[in] Context for callback.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
|||
LXB_API lxb_status_t |
|||
lxb_punycode_decode_cp(const lxb_codepoint_t *data, size_t length, |
|||
lexbor_serialize_cb_cp_f cb, void *ctx); |
|||
|
|||
/*
 * Decoding from characters to code points.
 *
 * Same as lxb_punycode_decode() only the output is code points.
 *
 * @param[in] Input characters for decoding. Not NULL.
 * @param[in] Length of characters. Can be 0.
 * @param[in] Callback for results. Called only once when decoding is complete.
 * @param[in] Context for callback.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
|||
LXB_API lxb_status_t |
|||
lxb_punycode_decode_cb_cp(const lxb_char_t *data, size_t length, |
|||
lexbor_serialize_cb_cp_f cb, void *ctx); |
|||
|
|||
#ifdef __cplusplus |
|||
} /* extern "C" */ |
|||
#endif |
|||
|
|||
#endif /* LEXBOR_PUNYCODE_H */ |
@ -0,0 +1,157 @@ |
|||
/* |
|||
* Copyright (C) 2023-2024 Alexander Borisov |
|||
* |
|||
* Author: Alexander Borisov <borisov@lexbor.com> |
|||
*/ |
|||
|
|||
#ifndef LEXBOR_UNICODE_BASE_H |
|||
#define LEXBOR_UNICODE_BASE_H |
|||
|
|||
#ifdef __cplusplus |
|||
extern "C" { |
|||
#endif |
|||
|
|||
#include "lexbor/core/base.h" |
|||
#include "lexbor/core/str.h" |
|||
|
|||
|
|||
/* Numeric components of the Unicode module version. */
#define LXB_UNICODE_VERSION_MAJOR 0
#define LXB_UNICODE_VERSION_MINOR 3
#define LXB_UNICODE_VERSION_PATCH 0

/*
 * The three components joined with "." into one string literal.
 * LEXBOR_STRINGIZE is not defined in this header.
 */
#define LXB_UNICODE_VERSION_STRING                                             \
    LEXBOR_STRINGIZE(LXB_UNICODE_VERSION_MAJOR) "."                            \
    LEXBOR_STRINGIZE(LXB_UNICODE_VERSION_MINOR) "."                            \
    LEXBOR_STRINGIZE(LXB_UNICODE_VERSION_PATCH)
|||
|
|||
|
|||
/*
 * Decomposition type tags. Values are consecutive starting at 0x00;
 * LXB_UNICODE_DECOMPOSITION_TYPE__LAST_ENTRY is one past the last real tag.
 */
enum {
    LXB_UNICODE_DECOMPOSITION_TYPE__UNDEF = 0x00,
    LXB_UNICODE_DECOMPOSITION_TYPE_CIRCLE,
    LXB_UNICODE_DECOMPOSITION_TYPE_COMPAT,
    LXB_UNICODE_DECOMPOSITION_TYPE_FINAL,
    LXB_UNICODE_DECOMPOSITION_TYPE_FONT,
    LXB_UNICODE_DECOMPOSITION_TYPE_FRACTION,
    LXB_UNICODE_DECOMPOSITION_TYPE_INITIAL,
    LXB_UNICODE_DECOMPOSITION_TYPE_ISOLATED,
    LXB_UNICODE_DECOMPOSITION_TYPE_MEDIAL,
    LXB_UNICODE_DECOMPOSITION_TYPE_NARROW,
    LXB_UNICODE_DECOMPOSITION_TYPE_NOBREAK,
    LXB_UNICODE_DECOMPOSITION_TYPE_SMALL,
    LXB_UNICODE_DECOMPOSITION_TYPE_SQUARE,
    LXB_UNICODE_DECOMPOSITION_TYPE_SUB,
    LXB_UNICODE_DECOMPOSITION_TYPE_SUPER,
    LXB_UNICODE_DECOMPOSITION_TYPE_VERTICAL,
    LXB_UNICODE_DECOMPOSITION_TYPE_WIDE,
    LXB_UNICODE_DECOMPOSITION_TYPE__LAST_ENTRY
};

/*
 * Bit 7 of a stored type is a separate flag; the low seven bits hold one of
 * the tags above. The macros set the flag value, test it, and mask it off.
 */
#define LXB_UNICODE_CANONICAL_SEPARATELY       (1 << 7)
#define LXB_UNICODE_IS_CANONICAL_SEPARATELY(a) ((a) >> 7)
#define LXB_UNICODE_DECOMPOSITION_TYPE(a)      ((a) & ~(1 << 7))

/* Storage type for a tag plus the bit-7 flag. */
typedef uint8_t lxb_unicode_decomposition_type_t;
|||
|
|||
/* Quick Check flags; each constant is a distinct bit so they can be OR-ed. */
enum {
    LXB_UNICODE_QUICK__UNDEF     = 0x00,
    LXB_UNICODE_QUICK_NFC_MAYBE  = 0x01, /* 1 << 0 */
    LXB_UNICODE_QUICK_NFC_NO     = 0x02, /* 1 << 1 */
    LXB_UNICODE_QUICK_NFD_NO     = 0x04, /* 1 << 2 */
    LXB_UNICODE_QUICK_NFKC_MAYBE = 0x08, /* 1 << 3 */
    LXB_UNICODE_QUICK_NFKC_NO    = 0x10, /* 1 << 4 */
    LXB_UNICODE_QUICK_NFKD_NO    = 0x20  /* 1 << 5 */
};

/* Storage type for a combination of the flags above. */
typedef uint8_t lxb_unicode_quick_type_t;
|||
|
|||
/* IDNA status of a code point. Values are consecutive starting at 0x00. */
enum {
    LXB_UNICODE_IDNA__UNDEF = 0x00,
    LXB_UNICODE_IDNA_DEVIATION,
    LXB_UNICODE_IDNA_DISALLOWED,
    LXB_UNICODE_IDNA_IGNORED,
    LXB_UNICODE_IDNA_MAPPED,
    LXB_UNICODE_IDNA_VALID
};

/* Storage type for one of the status values above. */
typedef uint8_t lxb_unicode_idna_type_t;
|||
|
|||
typedef struct lxb_unicode_normalizer lxb_unicode_normalizer_t; |
|||
|
|||
typedef struct { |
|||
lxb_codepoint_t cp; |
|||
uint8_t ccc; |
|||
} |
|||
lxb_unicode_buffer_t; |
|||
|
|||
typedef lxb_status_t |
|||
(*lxb_unicode_nf_handler_f)(lxb_unicode_normalizer_t *uc, const lxb_char_t *data, |
|||
size_t length, lexbor_serialize_cb_f cb, void *ctx, |
|||
bool is_last); |
|||
|
|||
typedef lxb_unicode_buffer_t * |
|||
(*lxb_unicode_de_handler_f)(lxb_unicode_normalizer_t *uc, lxb_codepoint_t cp, |
|||
lxb_unicode_buffer_t **buf, |
|||
const lxb_unicode_buffer_t **end); |
|||
|
|||
typedef void |
|||
(*lxb_unicode_co_handler_f)(lxb_unicode_buffer_t *starter, |
|||
lxb_unicode_buffer_t *op, lxb_unicode_buffer_t *p); |
|||
|
|||
|
|||
typedef struct { |
|||
uint16_t normalization; /* lxb_unicode_normalization_t */ |
|||
uint16_t idna; /* lxb_unicode_idna_t */ |
|||
} |
|||
lxb_unicode_entry_t; |
|||
|
|||
typedef struct { |
|||
lxb_unicode_decomposition_type_t type; |
|||
lxb_unicode_quick_type_t quick; /* Quick Check. */ |
|||
uint8_t ccc; /* Canonical Combining Class. */ |
|||
uint8_t length; |
|||
uint16_t decomposition; /* lxb_codepoint_t */ |
|||
uint16_t composition; /* lxb_unicode_composition_entry_t */ |
|||
} |
|||
lxb_unicode_normalization_entry_t; |
|||
|
|||
typedef struct { |
|||
lxb_unicode_idna_type_t type; |
|||
uint8_t length; |
|||
uint16_t index; |
|||
} |
|||
lxb_unicode_idna_entry_t; |
|||
|
|||
typedef struct { |
|||
uint8_t length; /* Length in lxb_unicode_composition_cps_t */ |
|||
uint16_t index; /* lxb_unicode_composition_cps_t */ |
|||
lxb_codepoint_t cp; /* Begin code point in lxb_unicode_composition_cps_t */ |
|||
} |
|||
lxb_unicode_composition_entry_t; |
|||
|
|||
typedef struct { |
|||
lxb_codepoint_t cp; |
|||
bool exclusion; |
|||
} |
|||
lxb_unicode_composition_cp_t; |
|||
|
|||
struct lxb_unicode_normalizer { |
|||
lxb_unicode_de_handler_f decomposition; |
|||
lxb_unicode_co_handler_f composition; |
|||
|
|||
lxb_unicode_buffer_t *starter; |
|||
|
|||
lxb_unicode_buffer_t *buf; |
|||
const lxb_unicode_buffer_t *end; |
|||
lxb_unicode_buffer_t *p; |
|||
lxb_unicode_buffer_t *ican; |
|||
|
|||
lxb_char_t tmp[4]; |
|||
uint8_t tmp_lenght; |
|||
|
|||
uint8_t quick_ccc; |
|||
lxb_unicode_quick_type_t quick_type; |
|||
|
|||
size_t flush_cp; |
|||
}; |
|||
|
|||
|
|||
#ifdef __cplusplus |
|||
} /* extern "C" */ |
|||
#endif |
|||
|
|||
#endif /* LEXBOR_UNICODE_BASE_H */ |
@ -0,0 +1,738 @@ |
|||
/* |
|||
* Copyright (C) 2023 Alexander Borisov |
|||
* |
|||
* Author: Alexander Borisov <borisov@lexbor.com> |
|||
*/ |
|||
|
|||
#include "lexbor/unicode/idna.h" |
|||
#include "lexbor/unicode/unicode.h" |
|||
#include "lexbor/punycode/punycode.h" |
|||
#include "lexbor/encoding/encoding.h" |
|||
|
|||
|
|||
typedef struct { |
|||
lxb_unicode_idna_cb_f cb; |
|||
void *context; |
|||
lxb_unicode_idna_flag_t flags; |
|||
} |
|||
lxb_unicode_idna_ctx_t; |
|||
|
|||
typedef struct { |
|||
lxb_char_t buffer[4096]; |
|||
lxb_char_t *p; |
|||
lxb_char_t *buf; |
|||
const lxb_char_t *end; |
|||
lxb_unicode_idna_flag_t flags; |
|||
} |
|||
lxb_unicode_idna_ascii_ctx_t; |
|||
|
|||
|
|||
static lxb_status_t |
|||
lxb_unicode_idna_processing_body(lxb_unicode_idna_t *idna, const void *data, |
|||
size_t len, lxb_unicode_idna_cb_f cb, void *ctx, |
|||
lxb_unicode_idna_flag_t flags, bool is_cp); |
|||
|
|||
static lxb_status_t |
|||
lxb_unicode_idna_norm_c_cb(const lxb_codepoint_t *cps, size_t len, void *ctx); |
|||
|
|||
static lxb_status_t |
|||
lxb_unicode_idna_norm_c_send(const lxb_codepoint_t *cps, |
|||
const lxb_codepoint_t *p, |
|||
lxb_unicode_idna_ctx_t *context); |
|||
|
|||
static lxb_status_t |
|||
lxb_unicode_idna_punycode_cb(const lxb_codepoint_t *cps, size_t len, void *ctx); |
|||
|
|||
static lxb_status_t |
|||
lxb_unicode_idna_to_ascii_cb(const lxb_codepoint_t *part, size_t len, |
|||
void *ctx, lxb_status_t status); |
|||
|
|||
static lxb_status_t |
|||
lxb_unicode_idna_to_ascii_body(lxb_unicode_idna_t *idna, const void *data, |
|||
size_t length, lexbor_serialize_cb_f cb, void *ctx, |
|||
lxb_unicode_idna_flag_t flags, bool is_cp); |
|||
|
|||
static lxb_status_t |
|||
lxb_unicode_idna_ascii_puny_cb(const lxb_char_t *data, size_t length, void *ctx, |
|||
bool unchanged); |
|||
|
|||
static lxb_status_t |
|||
lxb_unicode_idna_to_unicode_cb(const lxb_codepoint_t *part, size_t len, |
|||
void *ctx, lxb_status_t status); |
|||
|
|||
static lxb_status_t |
|||
lxb_unicode_idna_to_unicode_body(lxb_unicode_idna_t *idna, const void *data, |
|||
size_t length, lexbor_serialize_cb_f cb, |
|||
void *ctx, lxb_unicode_idna_flag_t flags, |
|||
bool is_cp); |
|||
|
|||
static bool |
|||
lxb_unicode_idna_validity_criteria_h(const void *data, size_t length, |
|||
lxb_unicode_idna_flag_t flags, bool is_cp); |
|||
|
|||
/*
 * Allocate an IDNA object with lexbor_malloc().
 *
 * The memory is returned exactly as lexbor_malloc() hands it out; call
 * lxb_unicode_idna_init() before use.
 *
 * @return The new object, or whatever lexbor_malloc() returns on failure.
 */
lxb_unicode_idna_t *
lxb_unicode_idna_create(void)
{
    lxb_unicode_idna_t *idna;

    idna = lexbor_malloc(sizeof(*idna));

    return idna;
}
|||
|
|||
lxb_status_t |
|||
lxb_unicode_idna_init(lxb_unicode_idna_t *idna) |
|||
{ |
|||
if (idna == NULL) { |
|||
return LXB_STATUS_ERROR_OBJECT_IS_NULL; |
|||
} |
|||
|
|||
return lxb_unicode_normalizer_init(&idna->normalizer, LXB_UNICODE_NFC); |
|||
} |
|||
|
|||
void |
|||
lxb_unicode_idna_clean(lxb_unicode_idna_t *idna) |
|||
{ |
|||
lxb_unicode_normalizer_clean(&idna->normalizer); |
|||
} |
|||
|
|||
/*
 * Release the resources of an IDNA object.
 *
 * The embedded normalizer is destroyed without freeing its own storage
 * (it lives inside idna).
 *
 * @param[in] idna          Object to destroy. NULL is accepted.
 * @param[in] self_destroy  When true the object itself is passed to
 *                          lexbor_free().
 *
 * @return NULL for a NULL object, the result of lexbor_free() when
 * self_destroy is true, otherwise idna.
 */
lxb_unicode_idna_t *
lxb_unicode_idna_destroy(lxb_unicode_idna_t *idna, bool self_destroy)
{
    if (idna == NULL) {
        return NULL;
    }

    (void) lxb_unicode_normalizer_destroy(&idna->normalizer, false);

    if (!self_destroy) {
        return idna;
    }

    return lexbor_free(idna);
}
|||
|
|||
/*
 * Grow a code point buffer to four times its capacity plus len.
 *
 * @param[in]      buf      Current start of the buffer.
 * @param[in]      buffer   The caller's fixed array. When buf equals it, a
 *                          heap block is allocated and the code points written
 *                          so far are copied into it; otherwise the existing
 *                          heap block is resized.
 * @param[in, out] buf_p    Write position; re-based onto the new storage.
 * @param[in, out] buf_end  One past the end; set to the new capacity.
 * @param[in]      len      Extra code points to reserve on top of the growth.
 *
 * @return Start of the new storage. On failure NULL for the malloc path, or
 * the result of lexbor_free(buf) for the realloc path (callers test the
 * result against NULL).
 */
lxb_codepoint_t *
lxb_unicode_idna_realloc(lxb_codepoint_t *buf, const lxb_codepoint_t *buffer,
                         lxb_codepoint_t **buf_p, lxb_codepoint_t **buf_end,
                         size_t len)
{
    size_t nlen, used;
    lxb_codepoint_t *tmp;

    /* Taken before any reallocation: buf must not be used afterwards. */
    used = *buf_p - buf;
    nlen = ((*buf_end - buf) * 4) + len;

    if (buf == buffer) {
        tmp = lexbor_malloc(nlen * sizeof(lxb_codepoint_t));
        if (tmp == NULL) {
            return NULL;
        }

        /* Carry over what was collected in the fixed array so far. */
        memcpy(tmp, buf, used * sizeof(lxb_codepoint_t));
    }
    else {
        tmp = lexbor_realloc(buf, nlen * sizeof(lxb_codepoint_t));
        if (tmp == NULL) {
            return lexbor_free(buf);
        }
    }

    *buf_p = tmp + used;
    *buf_end = tmp + nlen;

    return tmp;
}
|||
|
|||
lxb_status_t |
|||
lxb_unicode_idna_processing(lxb_unicode_idna_t *idna, const lxb_char_t *data, |
|||
size_t length, lxb_unicode_idna_cb_f cb, void *ctx, |
|||
lxb_unicode_idna_flag_t flags) |
|||
{ |
|||
return lxb_unicode_idna_processing_body(idna, data, length, cb, ctx, |
|||
flags, false); |
|||
} |
|||
|
|||
lxb_status_t |
|||
lxb_unicode_idna_processing_cp(lxb_unicode_idna_t *idna, |
|||
const lxb_codepoint_t *cps, size_t length, |
|||
lxb_unicode_idna_cb_f cb, void *ctx, |
|||
lxb_unicode_idna_flag_t flags) |
|||
{ |
|||
return lxb_unicode_idna_processing_body(idna, cps, length, cb, ctx, |
|||
flags, true); |
|||
} |
|||
|
|||
static lxb_status_t |
|||
lxb_unicode_idna_processing_body(lxb_unicode_idna_t *idna, const void *data, |
|||
size_t len, lxb_unicode_idna_cb_f cb, void *ctx, |
|||
lxb_unicode_idna_flag_t flags, bool is_cp) |
|||
{ |
|||
bool need; |
|||
size_t i, length; |
|||
lxb_status_t status; |
|||
lxb_codepoint_t cp, *buf, *buf_p, *buf_end; |
|||
const lxb_char_t *end, *p; |
|||
lxb_unicode_idna_type_t type; |
|||
const lxb_unicode_idna_entry_t *udata; |
|||
const lxb_codepoint_t *maps; |
|||
lxb_unicode_idna_ctx_t context; |
|||
lxb_codepoint_t buffer[4096]; |
|||
|
|||
buf = buffer; |
|||
buf_p = buffer; |
|||
buf_end = buffer + (sizeof(buffer) / sizeof(lxb_codepoint_t)); |
|||
|
|||
p = data; |
|||
len *= (is_cp) ? sizeof(lxb_codepoint_t) : 1; |
|||
end = (const lxb_char_t *) data + len; |
|||
|
|||
while (p < end) { |
|||
if (is_cp) { |
|||
cp = *((const lxb_codepoint_t *) p); |
|||
p = (const lxb_char_t *) ((const lxb_codepoint_t *) p + 1); |
|||
} |
|||
else { |
|||
cp = lxb_encoding_decode_valid_utf_8_single(&p, end); |
|||
if (cp > LXB_ENCODING_DECODE_MAX_CODEPOINT) { |
|||
status = LXB_STATUS_ERROR_UNEXPECTED_DATA; |
|||
goto done; |
|||
} |
|||
} |
|||
|
|||
type = lxb_unicode_idna_type(cp); |
|||
|
|||
again: |
|||
|
|||
switch (type) { |
|||
case LXB_UNICODE_IDNA_IGNORED: |
|||
break; |
|||
|
|||
case LXB_UNICODE_IDNA_MAPPED: |
|||
udata = lxb_unicode_idna_entry_by_cp(cp); |
|||
maps = lxb_unicode_idna_map(udata, &length); |
|||
|
|||
if (buf_p + length > buf_end) { |
|||
buf = lxb_unicode_idna_realloc(buf, buffer, &buf_p, |
|||
&buf_end, length); |
|||
if (buf == NULL) { |
|||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION; |
|||
} |
|||
} |
|||
|
|||
for (i = 0; i < length; i++) { |
|||
*buf_p++ = maps[i]; |
|||
} |
|||
|
|||
break; |
|||
|
|||
case LXB_UNICODE_IDNA_DEVIATION: |
|||
if ((flags & LXB_UNICODE_IDNA_FLAG_TRANSITIONAL_PROCESSING)) { |
|||
type = LXB_UNICODE_IDNA_MAPPED; |
|||
goto again; |
|||
} |
|||
|
|||
/* Fall through. */ |
|||
|
|||
case LXB_UNICODE_IDNA_DISALLOWED: |
|||
/* Fall through. */ |
|||
|
|||
case LXB_UNICODE_IDNA_VALID: |
|||
default: |
|||
if (buf_p >= buf_end) { |
|||
buf = lxb_unicode_idna_realloc(buf, buffer, &buf_p, |
|||
&buf_end, 1); |
|||
if (buf == NULL) { |
|||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION; |
|||
} |
|||
} |
|||
|
|||
*buf_p++ = cp; |
|||
break; |
|||
} |
|||
} |
|||
|
|||
context.cb = cb; |
|||
context.context = ctx; |
|||
context.flags = flags; |
|||
|
|||
|
|||
need = lxb_unicode_quick_check_cp(&idna->normalizer, buf, buf_p - buf, |
|||
true); |
|||
if (need) { |
|||
lxb_unicode_flush_count_set(&idna->normalizer, UINT32_MAX); |
|||
|
|||
status = lxb_unicode_normalize_cp(&idna->normalizer, buf, buf_p - buf, |
|||
lxb_unicode_idna_norm_c_cb, |
|||
&context, true); |
|||
} |
|||
else { |
|||
status = lxb_unicode_idna_norm_c_cb(buf, buf_p - buf, &context); |
|||
} |
|||
|
|||
done: |
|||
|
|||
if (buf != buffer) { |
|||
(void) lexbor_free(buf); |
|||
} |
|||
|
|||
return status; |
|||
} |
|||
|
|||
static lxb_status_t |
|||
lxb_unicode_idna_norm_c_cb(const lxb_codepoint_t *cps, size_t len, void *ctx) |
|||
{ |
|||
lxb_status_t status; |
|||
lxb_unicode_idna_ctx_t *context = ctx; |
|||
const lxb_codepoint_t *p, *end; |
|||
|
|||
p = cps; |
|||
end = cps + len; |
|||
|
|||
while (p < end) { |
|||
/* U+002E ( . ) FULL STOP. */ |
|||
|
|||
if (*p == 0x002E) { |
|||
status = lxb_unicode_idna_norm_c_send(cps, p, context); |
|||
if (status != LXB_STATUS_OK) { |
|||
return status; |
|||
} |
|||
|
|||
cps = p + 1; |
|||
} |
|||
|
|||
p += 1; |
|||
} |
|||
|
|||
/* |
|||
* We need to call a zero-length callback if the last codepoint was a |
|||
* U+002E ( . ) FULL STOP. |
|||
* |
|||
* For example, "muuuu." will call for two callbacks. |
|||
* First: "muuuu". |
|||
* Second: "" -- empty string with length = 0. |
|||
*/ |
|||
|
|||
if (p > cps || (len >= 1 && p[-1] == '.')) { |
|||
return lxb_unicode_idna_norm_c_send(cps, p, context); |
|||
} |
|||
|
|||
return LXB_STATUS_OK; |
|||
} |
|||
|
|||
static lxb_status_t |
|||
lxb_unicode_idna_norm_c_send(const lxb_codepoint_t *cps, |
|||
const lxb_codepoint_t *p, |
|||
lxb_unicode_idna_ctx_t *context) |
|||
{ |
|||
bool cr; |
|||
lxb_status_t status; |
|||
|
|||
/* xn-- or Xn-- or xN-- or XN-- */ |
|||
|
|||
if (p - cps >= 4 |
|||
&& (cps[0] == 0x0078 || cps[0] == 0x0058) |
|||
&& (cps[1] == 0x006E || cps[1] == 0x004E) |
|||
&& cps[2] == 0x002D && cps[3] == 0x002D) |
|||
{ |
|||
cps += 4; |
|||
status = lxb_punycode_decode_cp(cps, p - cps, |
|||
lxb_unicode_idna_punycode_cb, |
|||
context); |
|||
if (status == LXB_STATUS_OK) { |
|||
return LXB_STATUS_OK; |
|||
} |
|||
|
|||
cps -= 4; |
|||
} |
|||
else { |
|||
status = LXB_STATUS_OK; |
|||
} |
|||
|
|||
cr = lxb_unicode_idna_validity_criteria_cp(cps, p - cps, context->flags); |
|||
if (!cr) { |
|||
return LXB_STATUS_ERROR_UNEXPECTED_RESULT; |
|||
} |
|||
|
|||
return context->cb(cps, p - cps, context->context, status); |
|||
} |
|||
|
|||
static lxb_status_t |
|||
lxb_unicode_idna_punycode_cb(const lxb_codepoint_t *cps, size_t len, void *ctx) |
|||
{ |
|||
bool cr; |
|||
lxb_unicode_idna_ctx_t *context = ctx; |
|||
lxb_unicode_idna_ascii_ctx_t *asc = context->context; |
|||
|
|||
cr = lxb_unicode_idna_validity_criteria_cp(cps, len, asc->flags); |
|||
if (!cr) { |
|||
return LXB_STATUS_ERROR_UNEXPECTED_RESULT; |
|||
} |
|||
|
|||
return context->cb(cps, len, context->context, LXB_STATUS_OK); |
|||
} |
|||
|
|||
lxb_status_t |
|||
lxb_unicode_idna_to_ascii(lxb_unicode_idna_t *idna, const lxb_char_t *data, |
|||
size_t length, lexbor_serialize_cb_f cb, void *ctx, |
|||
lxb_unicode_idna_flag_t flags) |
|||
{ |
|||
return lxb_unicode_idna_to_ascii_body(idna, data, length, cb, ctx, |
|||
flags, false); |
|||
} |
|||
|
|||
lxb_status_t |
|||
lxb_unicode_idna_to_ascii_cp(lxb_unicode_idna_t *idna, const lxb_codepoint_t *cps, |
|||
size_t length, lexbor_serialize_cb_f cb, void *ctx, |
|||
lxb_unicode_idna_flag_t flags) |
|||
{ |
|||
return lxb_unicode_idna_to_ascii_body(idna, cps, length, cb, ctx, |
|||
flags, true); |
|||
} |
|||
|
|||
static lxb_status_t |
|||
lxb_unicode_idna_to_ascii_body(lxb_unicode_idna_t *idna, const void *data, |
|||
size_t length, lexbor_serialize_cb_f cb, void *ctx, |
|||
lxb_unicode_idna_flag_t flags, bool is_cp) |
|||
{ |
|||
size_t len; |
|||
lxb_status_t status; |
|||
lxb_unicode_idna_ascii_ctx_t context; |
|||
|
|||
context.p = context.buffer; |
|||
context.buf = context.buffer; |
|||
context.end = context.buf + sizeof(context.buffer); |
|||
context.flags = flags; |
|||
|
|||
if (!is_cp) { |
|||
status = lxb_unicode_idna_processing(idna, data, length, |
|||
lxb_unicode_idna_to_ascii_cb, |
|||
&context, flags); |
|||
} |
|||
else { |
|||
status = lxb_unicode_idna_processing_cp(idna, data, length, |
|||
lxb_unicode_idna_to_ascii_cb, |
|||
&context, flags); |
|||
} |
|||
|
|||
if (status != LXB_STATUS_OK) { |
|||
goto done; |
|||
} |
|||
|
|||
/* Remove last U+002E ( . ) FULL STOP. */ |
|||
|
|||
if (context.p > context.buf) { |
|||
context.p -= 1; |
|||
} |
|||
|
|||
len = context.p - context.buf; |
|||
|
|||
status = cb(context.buf, len, ctx); |
|||
|
|||
done: |
|||
|
|||
if (context.buf != context.buffer) { |
|||
(void) lexbor_free(context.buf); |
|||
} |
|||
|
|||
return status; |
|||
} |
|||
|
|||
static lxb_status_t |
|||
lxb_unicode_idna_to_ascii_cb(const lxb_codepoint_t *part, size_t len, |
|||
void *ctx, lxb_status_t status) |
|||
{ |
|||
if (status != LXB_STATUS_OK) { |
|||
return status; |
|||
} |
|||
|
|||
return lxb_punycode_encode_cp(part, len, lxb_unicode_idna_ascii_puny_cb, |
|||
ctx); |
|||
} |
|||
|
|||
static lxb_status_t |
|||
lxb_unicode_idna_ascii_puny_cb(const lxb_char_t *data, size_t length, void *ctx, |
|||
bool unchanged) |
|||
{ |
|||
size_t nlen; |
|||
lxb_char_t *tmp; |
|||
lxb_unicode_idna_ascii_ctx_t *asc = ctx; |
|||
|
|||
static const lexbor_str_t prefix = lexbor_str("xn--"); |
|||
|
|||
if (asc->p + length + 6 > asc->end) { |
|||
nlen = ((asc->end - asc->buf) * 4) + length + 6; |
|||
|
|||
if (asc->buf == asc->buffer) { |
|||
tmp = lexbor_malloc(nlen); |
|||
} |
|||
else { |
|||
tmp = lexbor_realloc(asc->buf, nlen); |
|||
} |
|||
|
|||
if (tmp == NULL) { |
|||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION; |
|||
} |
|||
|
|||
asc->p = tmp + (asc->p - asc->buf); |
|||
asc->buf = tmp; |
|||
asc->end = tmp + nlen; |
|||
} |
|||
|
|||
if (!unchanged) { |
|||
memcpy(asc->p, prefix.data, prefix.length); |
|||
asc->p += 4; |
|||
} |
|||
|
|||
memcpy(asc->p, data, length); |
|||
|
|||
asc->p += length; |
|||
*asc->p++ = '.'; |
|||
*asc->p = 0x00; |
|||
|
|||
return LXB_STATUS_OK; |
|||
} |
|||
|
|||
bool |
|||
lxb_unicode_idna_validity_criteria(const lxb_char_t *data, size_t length, |
|||
lxb_unicode_idna_flag_t flags) |
|||
{ |
|||
return lxb_unicode_idna_validity_criteria_h(data, length, flags, false); |
|||
} |
|||
|
|||
bool |
|||
lxb_unicode_idna_validity_criteria_cp(const lxb_codepoint_t *data, size_t length, |
|||
lxb_unicode_idna_flag_t flags) |
|||
{ |
|||
return lxb_unicode_idna_validity_criteria_h(data, length, flags, true); |
|||
} |
|||
|
|||
static bool |
|||
lxb_unicode_idna_validity_criteria_h(const void *data, size_t length, |
|||
lxb_unicode_idna_flag_t flags, bool is_cp) |
|||
{ |
|||
size_t len; |
|||
lxb_codepoint_t cp; |
|||
const lxb_codepoint_t *cps; |
|||
const lxb_char_t *p, *end; |
|||
lxb_unicode_idna_type_t type; |
|||
|
|||
p = data; |
|||
len = length * ((is_cp) ? sizeof(lxb_codepoint_t) : 1); |
|||
end = (const lxb_char_t *) data + len; |
|||
|
|||
if (flags & LXB_UNICODE_IDNA_FLAG_CHECK_HYPHENS) { |
|||
/* U+002D HYPHEN-MINUS */ |
|||
|
|||
if (is_cp) { |
|||
cps = data; |
|||
|
|||
if (length > 4) { |
|||
if (cps[3] == 0x002D || cps[4] == 0x002D) { |
|||
return false; |
|||
} |
|||
} |
|||
|
|||
if (length >= 1) { |
|||
if (cps[0] == 0x002D || cps[length - 1] == 0x002D) { |
|||
return false; |
|||
} |
|||
} |
|||
} |
|||
else { |
|||
if (length > 4) { |
|||
if (p[3] == 0x002D || p[4] == 0x002D) { |
|||
return false; |
|||
} |
|||
} |
|||
|
|||
if (length >= 1) { |
|||
if (p[0] == 0x002D || p[-1] == 0x002D) { |
|||
return false; |
|||
} |
|||
} |
|||
} |
|||
} |
|||
else if (length >= 4) { |
|||
if (is_cp) { |
|||
cps = data; |
|||
|
|||
if ( (cps[0] == 0x0078 || cps[0] == 0x0058) |
|||
&& (cps[1] == 0x006E || cps[1] == 0x004E) |
|||
&& cps[2] == 0x002D && cps[3] == 0x002D) |
|||
{ |
|||
return false; |
|||
} |
|||
} |
|||
else { |
|||
if ( (p[0] == 0x0078 || p[0] == 0x0058) |
|||
&& (p[1] == 0x006E || p[1] == 0x004E) |
|||
&& p[2] == 0x002D && p[3] == 0x002D) |
|||
{ |
|||
return false; |
|||
} |
|||
} |
|||
} |
|||
|
|||
while (p < end) { |
|||
if (!is_cp) { |
|||
cp = lxb_encoding_decode_valid_utf_8_single(&p, end); |
|||
if (cp == LXB_ENCODING_DECODE_ERROR) { |
|||
return false; |
|||
} |
|||
} |
|||
else { |
|||
cp = *((const lxb_codepoint_t *) p); |
|||
p = (const lxb_char_t *) ((const lxb_codepoint_t *) p + 1); |
|||
} |
|||
|
|||
/* U+002E ( . ) FULL STOP */ |
|||
|
|||
if (cp == 0x002E) { |
|||
return false; |
|||
} |
|||
|
|||
type = lxb_unicode_idna_type(cp); |
|||
|
|||
switch (type) { |
|||
case LXB_UNICODE_IDNA_VALID: |
|||
break; |
|||
|
|||
case LXB_UNICODE_IDNA_DEVIATION: |
|||
if (!(flags & LXB_UNICODE_IDNA_FLAG_TRANSITIONAL_PROCESSING)) { |
|||
break; |
|||
} |
|||
|
|||
/* Fall through. */ |
|||
|
|||
case LXB_UNICODE_IDNA_DISALLOWED: |
|||
case LXB_UNICODE_IDNA_IGNORED: |
|||
case LXB_UNICODE_IDNA_MAPPED: |
|||
default: |
|||
return false; |
|||
} |
|||
} |
|||
|
|||
return true; |
|||
} |
|||
|
|||
lxb_status_t |
|||
lxb_unicode_idna_to_unicode(lxb_unicode_idna_t *idna, const lxb_char_t *data, |
|||
size_t length, lexbor_serialize_cb_f cb, |
|||
void *ctx, lxb_unicode_idna_flag_t flags) |
|||
{ |
|||
return lxb_unicode_idna_to_unicode_body(idna, data, length, cb, ctx, |
|||
flags, false); |
|||
} |
|||
|
|||
lxb_status_t |
|||
lxb_unicode_idna_to_unicode_cp(lxb_unicode_idna_t *idna, |
|||
const lxb_codepoint_t *cps, |
|||
size_t length, lexbor_serialize_cb_f cb, |
|||
void *ctx, lxb_unicode_idna_flag_t flags) |
|||
{ |
|||
return lxb_unicode_idna_to_unicode_body(idna, cps, length, cb, ctx, |
|||
flags, true); |
|||
} |
|||
|
|||
static lxb_status_t |
|||
lxb_unicode_idna_to_unicode_body(lxb_unicode_idna_t *idna, const void *data, |
|||
size_t length, lexbor_serialize_cb_f cb, |
|||
void *ctx, lxb_unicode_idna_flag_t flags, |
|||
bool is_cp) |
|||
{ |
|||
size_t len; |
|||
lxb_status_t status; |
|||
lxb_unicode_idna_ascii_ctx_t context; |
|||
|
|||
context.p = context.buffer; |
|||
context.buf = context.buffer; |
|||
context.end = context.buf + sizeof(context.buffer); |
|||
context.flags = flags; |
|||
|
|||
if (!is_cp) { |
|||
status = lxb_unicode_idna_processing(idna, data, length, |
|||
lxb_unicode_idna_to_unicode_cb, |
|||
&context, flags); |
|||
} |
|||
else { |
|||
status = lxb_unicode_idna_processing_cp(idna, data, length, |
|||
lxb_unicode_idna_to_unicode_cb, |
|||
&context, flags); |
|||
} |
|||
|
|||
if (status != LXB_STATUS_OK) { |
|||
goto done; |
|||
} |
|||
|
|||
/* Remove last U+002E ( . ) FULL STOP. */ |
|||
|
|||
if (context.p > context.buf) { |
|||
context.p -= 1; |
|||
} |
|||
|
|||
len = context.p - context.buf; |
|||
|
|||
status = cb(context.buf, len, ctx); |
|||
|
|||
done: |
|||
|
|||
if (context.buf != context.buffer) { |
|||
(void) lexbor_free(context.buf); |
|||
} |
|||
|
|||
return status; |
|||
} |
|||
|
|||
|
|||
static lxb_status_t |
|||
lxb_unicode_idna_to_unicode_cb(const lxb_codepoint_t *part, size_t len, |
|||
void *ctx, lxb_status_t status) |
|||
{ |
|||
int8_t res; |
|||
size_t length, nlen; |
|||
lxb_char_t *tmp; |
|||
const lxb_codepoint_t *p, *end; |
|||
lxb_unicode_idna_ascii_ctx_t *asc = ctx; |
|||
|
|||
if (status != LXB_STATUS_OK) { |
|||
return status; |
|||
} |
|||
|
|||
p = part; |
|||
end = part + len; |
|||
|
|||
length = 0; |
|||
|
|||
while (p < end) { |
|||
res = lxb_encoding_encode_utf_8_length(*p++); |
|||
if (res == 0) { |
|||
return LXB_STATUS_ERROR_UNEXPECTED_DATA; |
|||
} |
|||
|
|||
length += res; |
|||
} |
|||
|
|||
if (asc->p + length + 2 > asc->end) { |
|||
nlen = ((asc->end - asc->buf) * 4) + length + 2; |
|||
|
|||
if (asc->buf == asc->buffer) { |
|||
tmp = lexbor_malloc(nlen); |
|||
} |
|||
else { |
|||
tmp = lexbor_realloc(asc->buf, nlen); |
|||
} |
|||
|
|||
if (tmp == NULL) { |
|||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION; |
|||
} |
|||
|
|||
asc->p = tmp + (asc->p - asc->buf); |
|||
asc->buf = tmp; |
|||
asc->end = tmp + nlen; |
|||
} |
|||
|
|||
p = part; |
|||
|
|||
while (p < end) { |
|||
(void) lxb_encoding_encode_utf_8_single(NULL, &asc->p, asc->end, *p++); |
|||
} |
|||
|
|||
*asc->p++ = '.'; |
|||
*asc->p = 0x00; |
|||
|
|||
return LXB_STATUS_OK; |
|||
} |
@ -0,0 +1,264 @@ |
|||
/* |
|||
* Copyright (C) 2023 Alexander Borisov |
|||
* |
|||
* Author: Alexander Borisov <borisov@lexbor.com> |
|||
* |
|||
* UNICODE IDNA COMPATIBILITY PROCESSING |
|||
* https://www.unicode.org/reports/tr46/ |
|||
*/ |
|||
|
|||
#ifndef LEXBOR_UNICODE_IDNA_H |
|||
#define LEXBOR_UNICODE_IDNA_H |
|||
|
|||
#ifdef __cplusplus |
|||
extern "C" { |
|||
#endif |
|||
|
|||
#include "lexbor/unicode/base.h" |
|||
|
|||
|
|||
typedef lxb_status_t |
|||
(*lxb_unicode_idna_cb_f)(const lxb_codepoint_t *part, size_t len, |
|||
void *ctx, lxb_status_t status); |
|||
|
|||
typedef enum { |
|||
LXB_UNICODE_IDNA_FLAG_UNDEF = 0x00, |
|||
LXB_UNICODE_IDNA_FLAG_USE_STD3ASCII_RULES = 1 << 1, |
|||
LXB_UNICODE_IDNA_FLAG_CHECK_HYPHENS = 1 << 2, |
|||
LXB_UNICODE_IDNA_FLAG_CHECK_BIDI = 1 << 3, /* Not implemented. */ |
|||
LXB_UNICODE_IDNA_FLAG_CHECK_JOINERS = 1 << 4, /* Not implemented. */ |
|||
LXB_UNICODE_IDNA_FLAG_TRANSITIONAL_PROCESSING = 1 << 5, |
|||
LXB_UNICODE_IDNA_FLAG_VERIFY_DNS_LENGTH = 1 << 6 |
|||
} |
|||
lxb_unicode_idna_flag_t; |
|||
|
|||
typedef struct { |
|||
lxb_unicode_normalizer_t normalizer; |
|||
} |
|||
lxb_unicode_idna_t; |
|||
|
|||
|
|||
/* |
|||
* Create lxb_unicode_idna_t object. |
|||
* |
|||
* @return lxb_unicode_idna_t * if successful, otherwise NULL. |
|||
*/ |
|||
LXB_API lxb_unicode_idna_t * |
|||
lxb_unicode_idna_create(void); |
|||
|
|||
/* |
|||
* Initialization of lxb_unicode_idna_t object. |
|||
* |
|||
* @param[in] lxb_unicode_idna_t *. May be NULL, |
|||
* LXB_STATUS_ERROR_OBJECT_IS_NULL status will be returned. |
|||
* |
|||
* @return LXB_STATUS_OK if successful, otherwise an error status value. |
|||
*/ |
|||
LXB_API lxb_status_t |
|||
lxb_unicode_idna_init(lxb_unicode_idna_t *idna); |
|||
|
|||
/* |
|||
* Clears the object and returns it to the state it had right after initialization. |
|||
* |
|||
* @param[in] lxb_unicode_idna_t * |
|||
*/ |
|||
LXB_API void |
|||
lxb_unicode_idna_clean(lxb_unicode_idna_t *idna); |
|||
|
|||
/* |
|||
* Destroy lxb_unicode_idna_t object. |
|||
* |
|||
* Release of occupied resources. |
|||
* |
|||
* @param[in] lxb_unicode_idna_t *. Can be NULL. |
|||
* @param[in] if false: only destroys internal buffers. |
|||
* if true: destroys the lxb_unicode_idna_t object and all internal buffers. |
|||
* |
|||
* @return lxb_unicode_idna_t * if self_destroy = false, otherwise NULL. |
|||
*/ |
|||
LXB_API lxb_unicode_idna_t * |
|||
lxb_unicode_idna_destroy(lxb_unicode_idna_t *idna, bool self_destroy); |
|||
|
|||
/* |
|||
* Domain name processing. |
|||
* |
|||
* Mapping, Normalization (NFC), Converting, Validating. |
|||
* |
|||
* Callback will be invoked at each level of the domain name. |
|||
* |
|||
* For example: |
|||
* lexbor.com -- there will be two callbacks, for "lexbor" and "com". |
|||
* |
|||
* https://www.unicode.org/reports/tr46/#Processing |
|||
* |
|||
* @param[in] lxb_unicode_idna_t *. |
|||
* @param[in] Input characters for processing. Not NULL. |
|||
* @param[in] Length of characters. Can be 0. |
|||
* @param[in] Callback for results of processing. |
|||
* @param[in] Context for callback. |
|||
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*). |
|||
* |
|||
* @return LXB_STATUS_OK if successful, otherwise an error status value. |
|||
*/ |
|||
LXB_API lxb_status_t |
|||
lxb_unicode_idna_processing(lxb_unicode_idna_t *idna, const lxb_char_t *data, |
|||
size_t length, lxb_unicode_idna_cb_f cb, void *ctx, |
|||
lxb_unicode_idna_flag_t flags); |
|||
|
|||
/* |
|||
* Domain name processing for code points. |
|||
* |
|||
* This function is exactly the same as lxb_unicode_idna_processing() only it |
|||
* takes code points instead of characters as input. |
|||
* |
|||
* * Please, see lxb_unicode_idna_processing() function. |
|||
* |
|||
* @param[in] lxb_unicode_idna_t *. |
|||
* @param[in] Input code points for processing. Not NULL. |
|||
* @param[in] Length of code points. Can be 0. |
|||
* @param[in] Callback for results of processing. |
|||
* @param[in] Context for callback. |
|||
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*). |
|||
* |
|||
* @return LXB_STATUS_OK if successful, otherwise an error status value. |
|||
*/ |
|||
LXB_API lxb_status_t |
|||
lxb_unicode_idna_processing_cp(lxb_unicode_idna_t *idna, |
|||
const lxb_codepoint_t *cps, size_t length, |
|||
lxb_unicode_idna_cb_f cb, void *ctx, |
|||
lxb_unicode_idna_flag_t flags); |
|||
|
|||
/* |
|||
* Processing and converting domain name to ASCII. |
|||
* |
|||
* Does the same thing as lxb_unicode_idna_processing() and additionally converts |
|||
* each part of the domain name to Punycode. |
|||
* |
|||
* Callback will be invoked only once, at the end of processing. |
|||
* |
|||
* https://www.unicode.org/reports/tr46/#ToASCII |
|||
* |
|||
* @param[in] lxb_unicode_idna_t *. |
|||
* @param[in] Input characters for processing. Not NULL. |
|||
* @param[in] Length of characters. Can be 0. |
|||
* @param[in] Callback for results of processing. |
|||
* @param[in] Context for callback. |
|||
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*). |
|||
* |
|||
* @return LXB_STATUS_OK if successful, otherwise an error status value. |
|||
*/ |
|||
LXB_API lxb_status_t |
|||
lxb_unicode_idna_to_ascii(lxb_unicode_idna_t *idna, const lxb_char_t *data, |
|||
size_t length, lexbor_serialize_cb_f cb, void *ctx, |
|||
lxb_unicode_idna_flag_t flags); |
|||
|
|||
/* |
|||
* Processing and converting domain name to ASCII for code points. |
|||
* |
|||
* This function is exactly the same as lxb_unicode_idna_to_ascii() only it |
|||
* takes code points instead of characters as input. |
|||
* |
|||
* Please, see lxb_unicode_idna_to_ascii() function. |
|||
* |
|||
* @param[in] lxb_unicode_idna_t *. |
|||
* @param[in] Input code points for processing. Not NULL. |
|||
* @param[in] Length of code points. Can be 0. |
|||
* @param[in] Callback for results of processing. |
|||
* @param[in] Context for callback. |
|||
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*). |
|||
* |
|||
* @return LXB_STATUS_OK if successful, otherwise an error status value. |
|||
*/ |
|||
LXB_API lxb_status_t |
|||
lxb_unicode_idna_to_ascii_cp(lxb_unicode_idna_t *idna, const lxb_codepoint_t *cps, |
|||
size_t length, lexbor_serialize_cb_f cb, void *ctx, |
|||
lxb_unicode_idna_flag_t flags); |
|||
|
|||
/* |
|||
* Processing and converting domain name to Unicode. |
|||
* |
|||
* Does the same thing as lxb_unicode_idna_processing(). |
|||
* |
|||
* Callback will be invoked only once, at the end of processing. |
|||
* |
|||
* https://www.unicode.org/reports/tr46/#ToUnicode |
|||
* |
|||
* @param[in] lxb_unicode_idna_t *. |
|||
* @param[in] Input characters for processing. Not NULL. |
|||
* @param[in] Length of characters. Can be 0. |
|||
* @param[in] Callback for results of processing. |
|||
* @param[in] Context for callback. |
|||
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*). |
|||
* |
|||
* @return LXB_STATUS_OK if successful, otherwise an error status value. |
|||
*/ |
|||
LXB_API lxb_status_t |
|||
lxb_unicode_idna_to_unicode(lxb_unicode_idna_t *idna, const lxb_char_t *data, |
|||
size_t length, lexbor_serialize_cb_f cb, void *ctx, |
|||
lxb_unicode_idna_flag_t flags); |
|||
|
|||
/* |
|||
* Processing and converting domain name to Unicode for code points. |
|||
* |
|||
* This function is exactly the same as lxb_unicode_idna_to_unicode() only it |
|||
* takes code points instead of characters as input. |
|||
* |
|||
* Please, see lxb_unicode_idna_to_unicode() function. |
|||
* |
|||
* @param[in] lxb_unicode_idna_t *. |
|||
* @param[in] Input code points for processing. Not NULL. |
|||
* @param[in] Length of code points. Can be 0. |
|||
* @param[in] Callback for results of processing. |
|||
* @param[in] Context for callback. |
|||
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*). |
|||
* |
|||
* @return LXB_STATUS_OK if successful, otherwise an error status value. |
|||
*/ |
|||
LXB_API lxb_status_t |
|||
lxb_unicode_idna_to_unicode_cp(lxb_unicode_idna_t *idna, const lxb_codepoint_t *cps, |
|||
size_t length, lexbor_serialize_cb_f cb, void *ctx, |
|||
lxb_unicode_idna_flag_t flags); |
|||
|
|||
/* |
|||
* Validity Criteria. |
|||
* |
|||
* The function checks the domain name for validity according to a number of |
|||
* criteria. |
|||
* |
|||
* LXB_UNICODE_IDNA_FLAG_CHECK_BIDI and LXB_UNICODE_IDNA_FLAG_CHECK_JOINERS |
|||
* not implemented. |
|||
* |
|||
* https://www.unicode.org/reports/tr46/#Validity_Criteria |
|||
* |
|||
* @param[in] Input characters for processing. Not NULL. |
|||
* @param[in] Length of characters. Can be 0. |
|||
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*). |
|||
* |
|||
* @return true if valid, otherwise false. |
|||
*/ |
|||
LXB_API bool |
|||
lxb_unicode_idna_validity_criteria(const lxb_char_t *data, size_t length, |
|||
lxb_unicode_idna_flag_t flags); |
|||
|
|||
/* |
|||
* Validity Criteria. |
|||
* |
|||
* Same as lxb_unicode_idna_validity_criteria() only it takes codepoints as |
|||
* input. |
|||
* |
|||
* @param[in] Input codepoints for processing. Not NULL. |
|||
* @param[in] Length of codepoints. Can be 0. |
|||
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*). |
|||
* |
|||
* @return true if valid, otherwise false. |
|||
*/ |
|||
LXB_API bool |
|||
lxb_unicode_idna_validity_criteria_cp(const lxb_codepoint_t *data, size_t length, |
|||
lxb_unicode_idna_flag_t flags); |
|||
|
|||
|
|||
#ifdef __cplusplus |
|||
} /* extern "C" */ |
|||
#endif |
|||
|
|||
#endif /* LEXBOR_UNICODE_IDNA_H */ |
201955
ext/lexbor/lexbor/unicode/res.h
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
1039
ext/lexbor/lexbor/unicode/unicode.c
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
@ -0,0 +1,405 @@ |
|||
/* |
|||
* Copyright (C) 2023 Alexander Borisov |
|||
* |
|||
* Author: Alexander Borisov <borisov@lexbor.com> |
|||
*/ |
|||
|
|||
#ifndef LEXBOR_UNICODE_H |
|||
#define LEXBOR_UNICODE_H |
|||
|
|||
#ifdef __cplusplus |
|||
extern "C" { |
|||
#endif |
|||
|
|||
#include "lexbor/unicode/base.h" |
|||
#include "lexbor/unicode/idna.h" |
|||
#include "lexbor/core/array_obj.h" |
|||
|
|||
|
|||
typedef enum { |
|||
LXB_UNICODE_NFC = 0x00, /* Normalization Form C (NFC). */ |
|||
LXB_UNICODE_NFD = 0x01, /* Normalization Form D (NFD). */ |
|||
LXB_UNICODE_NFKC = 0x02, /* Normalization Form KC (NFKC). */ |
|||
LXB_UNICODE_NFKD = 0x03 /* Normalization Form KD (NFKD). */ |
|||
} |
|||
lxb_unicode_form_t; |
|||
|
|||
|
|||
/* |
|||
* Create lxb_unicode_normalizer_t object. |
|||
* |
|||
* @return lxb_unicode_normalizer_t * if successful, otherwise NULL. |
|||
*/ |
|||
LXB_API lxb_unicode_normalizer_t * |
|||
lxb_unicode_normalizer_create(void); |
|||
|
|||
/* |
|||
* Initialization of lxb_unicode_normalizer_t object. |
|||
* |
|||
* Support normalization forms: |
|||
* Normalization Form D (NFD): LXB_UNICODE_NFD |
|||
* Normalization Form C (NFC): LXB_UNICODE_NFC |
|||
* Normalization Form KD (NFKD): LXB_UNICODE_NFKD |
|||
* Normalization Form KC (NFKC): LXB_UNICODE_NFKC |
|||
* |
|||
* https://www.unicode.org/reports/tr15/ |
|||
* |
|||
* @param[in] lxb_unicode_normalizer_t * |
|||
* @param[in] Normalization form. |
|||
* |
|||
* @return LXB_STATUS_OK if successful, otherwise an error status value. |
|||
*/ |
|||
LXB_API lxb_status_t |
|||
lxb_unicode_normalizer_init(lxb_unicode_normalizer_t *uc, |
|||
lxb_unicode_form_t form); |
|||
|
|||
/* |
|||
* Cleaning of lxb_unicode_normalizer_t object. |
|||
* |
|||
* Clears the object. Returns to states as after initialization. |
|||
* |
|||
* @param[in] lxb_unicode_normalizer_t * |
|||
*/ |
|||
LXB_API void |
|||
lxb_unicode_normalizer_clean(lxb_unicode_normalizer_t *uc); |
|||
|
|||
/* |
|||
* Destroy lxb_unicode_normalizer_t object. |
|||
* |
|||
* Release of occupied resources. |
|||
* |
|||
* @param[in] lxb_unicode_normalizer_t *. Can be NULL. |
|||
* @param[in] if false: only destroys internal buffers. |
|||
* if true: destroys the lxb_unicode_normalizer_t object and all internal buffers. |
|||
* |
|||
* @return lxb_unicode_normalizer_t * if self_destroy = false, otherwise NULL. |
|||
*/ |
|||
LXB_API lxb_unicode_normalizer_t * |
|||
lxb_unicode_normalizer_destroy(lxb_unicode_normalizer_t *uc, bool self_destroy); |
|||
|
|||
/* |
|||
* Unicode normalization forms. |
|||
* |
|||
* This is a function with an implementation of the unicode normalization |
|||
* algorithm. |
|||
* |
|||
* The function is designed to work with a stream (chunks). |
|||
* |
|||
* Please, see examples for this function in examples/lexbor/unicode directory. |
|||
* |
|||
* @param[in] lxb_unicode_normalizer_t * |
|||
* @param[in] Input characters for normalization. Not NULL. |
|||
* @param[in] Length of characters. Can be 0. |
|||
* @param[in] Callback for results of normalization. |
|||
* @param[in] Context for callback. |
|||
* @param[in] Set to true if the last chunk or the only one chunk is processed. |
|||
* |
|||
* @return LXB_STATUS_OK if successful, otherwise an error status value. |
|||
*/ |
|||
LXB_API lxb_status_t |
|||
lxb_unicode_normalize(lxb_unicode_normalizer_t *uc, const lxb_char_t *data, |
|||
size_t length, lexbor_serialize_cb_f cb, void *ctx, |
|||
bool is_last); |
|||
|
|||
/* |
|||
* Unicode normalization end. |
|||
* |
|||
* The function is used to complete a normalization. |
|||
* Same as calling the lxb_unicode_normalize() function with is_last = true. |
|||
* |
|||
* Use this function only if you do not set is_last = true in |
|||
* the lxb_unicode_normalize() function. |
|||
* |
|||
* For example: |
|||
* status = lxb_unicode_normalize(uc, data, length, cb, NULL, false); |
|||
* status = lxb_unicode_normalize(uc, data, length, cb, NULL, false); |
|||
* status = lxb_unicode_normalize_end(uc, cb, NULL); |
|||
* |
|||
* The same as: |
|||
* status = lxb_unicode_normalize(uc, data, length, cb, NULL, false); |
|||
* status = lxb_unicode_normalize(uc, data, length, cb, NULL, true); |
|||
* |
|||
* @param[in] lxb_unicode_normalizer_t * |
|||
* @param[in] Callback for results of normalization. |
|||
* @param[in] Context for callback. |
|||
* |
|||
* @return LXB_STATUS_OK if successful, otherwise an error status value. |
|||
*/ |
|||
LXB_API lxb_status_t |
|||
lxb_unicode_normalize_end(lxb_unicode_normalizer_t *uc, lexbor_serialize_cb_f cb, |
|||
void *ctx); |
|||
|
|||
/* |
|||
* Unicode normalization forms for code points. |
|||
* |
|||
* This function is exactly the same as lxb_unicode_normalize() only it takes |
|||
* code points instead of characters as input. |
|||
* |
|||
* Also, unlike the lxb_unicode_normalize() function, a callback will be called |
|||
* to return a code points, not characters. |
|||
* |
|||
* The function is designed to work with a stream (chunks). |
|||
* |
|||
* @param[in] lxb_unicode_normalizer_t * |
|||
* @param[in] Input code points for normalization. Not NULL. |
|||
* @param[in] Length of code points. Can be 0. |
|||
* @param[in] Callback for results of normalization. |
|||
* @param[in] Context for callback. |
|||
* @param[in] Set to true if the last chunk or the only one chunk is processed. |
|||
* |
|||
* @return LXB_STATUS_OK if successful, otherwise an error status value. |
|||
*/ |
|||
LXB_API lxb_status_t |
|||
lxb_unicode_normalize_cp(lxb_unicode_normalizer_t *uc, const lxb_codepoint_t *cps, |
|||
size_t length, lexbor_serialize_cb_cp_f cb, void *ctx, |
|||
bool is_last); |
|||
|
|||
/* |
|||
* Unicode normalization end for code points. |
|||
* |
|||
* This function is completely similar to lxb_unicode_normalize_end(), |
|||
* only it takes a function with code points as a callback function. |
|||
* |
|||
* Same as calling the lxb_unicode_normalize_cp() function with is_last = true. |
|||
* |
|||
* Use this function only if you do not set is_last = true in |
|||
* the lxb_unicode_normalize_cp() function. |
|||
* |
|||
* For example: |
|||
* status = lxb_unicode_normalize_cp(uc, cps, length, cb, NULL, false); |
|||
* status = lxb_unicode_normalize_cp(uc, cps, length, cb, NULL, false); |
|||
* status = lxb_unicode_normalize_cp_end(uc, cb, NULL); |
|||
* |
|||
* The same as: |
|||
* status = lxb_unicode_normalize_cp(uc, cps, length, cb, NULL, false); |
|||
* status = lxb_unicode_normalize_cp(uc, cps, length, cb, NULL, true); |
|||
* |
|||
* @param[in] lxb_unicode_normalizer_t * |
|||
* @param[in] Callback for results of normalization. |
|||
* @param[in] Context for callback. |
|||
* |
|||
* @return LXB_STATUS_OK if successful, otherwise an error status value. |
|||
*/ |
|||
LXB_API lxb_status_t |
|||
lxb_unicode_normalize_cp_end(lxb_unicode_normalizer_t *uc, |
|||
lexbor_serialize_cb_cp_f cb, void *ctx); |
|||
|
|||
/* |
|||
* Quick Check. |
|||
* |
|||
* The basic normalization algorithm is not simple and requires time |
|||
* and resources. |
|||
* This function checks relatively quickly if the text needs to be normalized. |
|||
* |
|||
* The function is designed to work with a stream (chunks). |
|||
* |
|||
* https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms |
|||
* |
|||
* @param[in] lxb_unicode_normalizer_t * |
|||
* @param[in] Input characters for checks. Not NULL. |
|||
* @param[in] Length of characters. Can be 0. |
|||
* @param[in] Set to true if the last chunk or the only one chunk is processed. |
|||
* |
|||
* @return true if it needs to be normalized, otherwise false. |
|||
*/ |
|||
LXB_API bool |
|||
lxb_unicode_quick_check(lxb_unicode_normalizer_t *uc, const lxb_char_t *data, |
|||
size_t length, bool is_last); |
|||
|
|||
/* |
|||
* Quick Check End. |
|||
* |
|||
* The function is used to complete a quick check. |
|||
* Same as calling the lxb_unicode_quick_check() function with is_last = true. |
|||
* |
|||
* Use this function only if you do not set is_last = true in |
|||
* the lxb_unicode_quick_check() function. |
|||
* |
|||
* For example: |
|||
* is = lxb_unicode_quick_check(uc, data, length, false); |
|||
* is = lxb_unicode_quick_check(uc, data, length, false); |
|||
* is = lxb_unicode_quick_check_end(uc); |
|||
* |
|||
* The same as: |
|||
* is = lxb_unicode_quick_check(uc, data, length, false); |
|||
* is = lxb_unicode_quick_check(uc, data, length, true); |
|||
* |
|||
* @param[in] lxb_unicode_normalizer_t * |
|||
* |
|||
* @return true if it needs to be normalized, otherwise false. |
|||
*/ |
|||
LXB_API bool |
|||
lxb_unicode_quick_check_end(lxb_unicode_normalizer_t *uc); |
|||
|
|||
/* |
|||
* Quick Check for code points. |
|||
* |
|||
* Same as lxb_unicode_quick_check() only it takes code points as input. |
|||
* |
|||
* @param[in] lxb_unicode_normalizer_t * |
|||
* @param[in] Input code points for checks. Not NULL. |
|||
* @param[in] Length of code points. Can be 0. |
|||
* @param[in] Set to true if the last chunk or the only one chunk is processed. |
|||
* |
|||
* @return true if it needs to be normalized, otherwise false. |
|||
*/ |
|||
LXB_API bool |
|||
lxb_unicode_quick_check_cp(lxb_unicode_normalizer_t *uc, |
|||
const lxb_codepoint_t *cps, size_t length, |
|||
bool is_last); |
|||
|
|||
/* |
|||
* Quick Check End for code points. |
|||
* |
|||
* Same as lxb_unicode_quick_check_end(). |
|||
* |
|||
* For example: |
|||
* is = lxb_unicode_quick_check_cp(uc, cps, length, false); |
|||
* is = lxb_unicode_quick_check_cp(uc, cps, length, false); |
|||
* is = lxb_unicode_quick_check_cp_end(uc); |
|||
* |
|||
* The same as: |
|||
* is = lxb_unicode_quick_check_cp(uc, cps, length, false); |
|||
* is = lxb_unicode_quick_check_cp(uc, cps, length, true); |
|||
* |
|||
* @param[in] lxb_unicode_normalizer_t * |
|||
* |
|||
* @return true if it needs to be normalized, otherwise false. |
|||
*/ |
|||
LXB_API bool |
|||
lxb_unicode_quick_check_cp_end(lxb_unicode_normalizer_t *uc); |
|||
|
|||
/* |
|||
* Flush. |
|||
* |
|||
* Force flush the buffer to the user's callback if it is possible. |
|||
* |
|||
* Please, see lxb_unicode_flush_count_set() function. |
|||
* |
|||
* @param[in] lxb_unicode_normalizer_t *. |
|||
* @param[in] Callback. |
|||
* @param[in] Callback context. |
|||
* |
|||
* @return LXB_STATUS_OK if successful, otherwise an error status value. |
|||
*/ |
|||
LXB_API lxb_status_t |
|||
lxb_unicode_flush(lxb_unicode_normalizer_t *uc, lexbor_serialize_cb_f cb, |
|||
void *ctx); |
|||
|
|||
/* |
|||
* Flush for code points. |
|||
* |
|||
* Same as lxb_unicode_flush(), but it takes a callback with code points as |
|||
* input. |
|||
* |
|||
* @param[in] lxb_unicode_normalizer_t *. |
|||
* @param[in] Callback. |
|||
* @param[in] Callback context. |
|||
* |
|||
* @return LXB_STATUS_OK if successful, otherwise an error status value. |
|||
*/ |
|||
LXB_API lxb_status_t |
|||
lxb_unicode_flush_cp(lxb_unicode_normalizer_t *uc, lexbor_serialize_cb_cp_f cb, |
|||
void *ctx); |
|||
|
|||
/* |
|||
* Change normalization form. |
|||
* |
|||
* You should only apply this function after one of the following actions: |
|||
* 1. The lxb_unicode_normalize() function was called with is_last = true. |
|||
* That is, the processing of the previous type was successfully |
|||
* completed. |
|||
* OR |
|||
* 2. The end of normalization function was called: |
|||
*        lxb_unicode_normalize_end(). |
|||
* OR |
|||
* 3. The lxb_unicode_normalizer_t object cleanup function was called: |
|||
* lxb_unicode_normalizer_clean(). |
|||
* |
|||
* |
|||
* All this is to be able to normalize or quickly check text with different |
|||
* types without creating new objects. |
|||
* |
|||
* @param[in] lxb_unicode_normalizer_t *. |
|||
* @param[in] Normalization form. |
|||
* |
|||
* @return LXB_STATUS_OK if successful, otherwise an error status value. |
|||
*/ |
|||
LXB_API lxb_status_t |
|||
lxb_unicode_normalization_form_set(lxb_unicode_normalizer_t *uc, |
|||
lxb_unicode_form_t form); |
|||
|
|||
LXB_API const lxb_unicode_entry_t * |
|||
lxb_unicode_entry(lxb_codepoint_t cp); |
|||
|
|||
LXB_API const lxb_unicode_composition_cp_t * |
|||
lxb_unicode_compose_entry(lxb_codepoint_t first, lxb_codepoint_t second); |
|||
|
|||
LXB_API lxb_unicode_idna_type_t |
|||
lxb_unicode_idna_type(lxb_codepoint_t cp); |
|||
|
|||
LXB_API const lxb_unicode_composition_cp_t * |
|||
lxb_unicode_composition_cp(lxb_codepoint_t first, lxb_codepoint_t second); |
|||
|
|||
LXB_API const lxb_unicode_normalization_entry_t * |
|||
lxb_unicode_normalization_entry(const lxb_unicode_entry_t *entry); |
|||
|
|||
LXB_API const lxb_unicode_normalization_entry_t * |
|||
lxb_unicode_normalization_entry_by_cp(lxb_codepoint_t cp); |
|||
|
|||
LXB_API const lxb_unicode_normalization_entry_t * |
|||
lxb_unicode_normalization_entry_by_index(uint16_t index); |
|||
|
|||
LXB_API bool |
|||
lxb_unicode_normalization_is_null(const lxb_unicode_normalization_entry_t *entry); |
|||
|
|||
LXB_API const lxb_codepoint_t * |
|||
lxb_unicode_full_canonical(const lxb_unicode_normalization_entry_t *entry, |
|||
size_t *out_length); |
|||
|
|||
LXB_API const lxb_codepoint_t * |
|||
lxb_unicode_full_compatibility(const lxb_unicode_normalization_entry_t *entry, |
|||
size_t *out_length); |
|||
|
|||
LXB_API const lxb_unicode_idna_entry_t * |
|||
lxb_unicode_idna_entry(const lxb_unicode_entry_t *entry); |
|||
|
|||
LXB_API const lxb_unicode_idna_entry_t * |
|||
lxb_unicode_idna_entry_by_cp(lxb_codepoint_t cp); |
|||
|
|||
LXB_API const lxb_unicode_idna_entry_t * |
|||
lxb_unicode_idna_entry_by_index(uint16_t index); |
|||
|
|||
LXB_API const lxb_codepoint_t * |
|||
lxb_unicode_idna_map(const lxb_unicode_idna_entry_t *entry, |
|||
size_t *out_length); |
|||
|
|||
/* |
|||
* Inline functions. |
|||
*/ |
|||
|
|||
/* |
|||
* Sets the buffer size for codepoints. |
|||
* |
|||
* By default, 4096 processed codepoints are accumulated before converting them |
|||
* to lxb_char_t and returning the result to the user via callback. |
|||
* |
|||
* If set the count to 0, the user callback will be called for every codepoint |
|||
* processed. That is, it will be streaming without accumulation in |
|||
* the intermediate buffer. |
|||
* |
|||
* @param[in] lxb_unicode_normalizer_t *. |
|||
* @param[in] Count of codepoints in the buffer. |
|||
*/ |
|||
lxb_inline void |
|||
lxb_unicode_flush_count_set(lxb_unicode_normalizer_t *uc, size_t count) |
|||
{ |
|||
uc->flush_cp = count; |
|||
} |
|||
|
|||
|
|||
#ifdef __cplusplus |
|||
} /* extern "C" */ |
|||
#endif |
|||
|
|||
#endif /* LEXBOR_UNICODE_H */ |
@ -0,0 +1,32 @@ |
|||
/* |
|||
* Copyright (C) 2023-2024 Alexander Borisov |
|||
* |
|||
* Author: Alexander Borisov <borisov@lexbor.com> |
|||
*/ |
|||
|
|||
#ifndef LEXBOR_URL_BASE_H |
|||
#define LEXBOR_URL_BASE_H |
|||
|
|||
#ifdef __cplusplus |
|||
extern "C" { |
|||
#endif |
|||
|
|||
#include "lexbor/core/base.h" |
|||
#include "lexbor/core/mraw.h" |
|||
#include "lexbor/core/str.h" |
|||
|
|||
|
|||
#define LXB_URL_VERSION_MAJOR 0 |
|||
#define LXB_URL_VERSION_MINOR 3 |
|||
#define LXB_URL_VERSION_PATCH 0 |
|||
|
|||
#define LXB_URL_VERSION_STRING LEXBOR_STRINGIZE(LXB_URL_VERSION_MAJOR) "." \ |
|||
LEXBOR_STRINGIZE(LXB_URL_VERSION_MINOR) "." \ |
|||
LEXBOR_STRINGIZE(LXB_URL_VERSION_PATCH) |
|||
|
|||
|
|||
#ifdef __cplusplus |
|||
} /* extern "C" */ |
|||
#endif |
|||
|
|||
#endif /* LEXBOR_URL_BASE_H */ |
4845
ext/lexbor/lexbor/url/url.c
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
@ -0,0 +1,551 @@ |
|||
/* |
|||
* Copyright (C) 2023 Alexander Borisov |
|||
* |
|||
* Author: Alexander Borisov <borisov@lexbor.com> |
|||
* |
|||
* The URL Standard. |
|||
* By specification: https://url.spec.whatwg.org/ |
|||
*/ |
|||
|
|||
#ifndef LEXBOR_URL_H |
|||
#define LEXBOR_URL_H |
|||
|
|||
#ifdef __cplusplus |
|||
extern "C" { |
|||
#endif |
|||
|
|||
#include "lexbor/url/base.h" |
|||
#include "lexbor/core/mraw.h" |
|||
#include "lexbor/core/plog.h" |
|||
#include "lexbor/encoding/encoding.h" |
|||
#include "lexbor/unicode/unicode.h" |
|||
|
|||
|
|||
/*
 * Kinds of errors the URL parser can report.
 *
 * NOTE(review): the names appear to mirror the validation errors of the
 * WHATWG URL Standard -- confirm against the specification.
 *
 * Values are assigned implicitly starting from 0x00, so the order of the
 * entries is significant.
 */
typedef enum {
    /* Domain and host. */
    LXB_URL_ERROR_TYPE_DOMAIN_TO_ASCII = 0x00,
    LXB_URL_ERROR_TYPE_DOMAIN_TO_UNICODE,
    LXB_URL_ERROR_TYPE_DOMAIN_INVALID_CODE_POINT,
    LXB_URL_ERROR_TYPE_HOST_INVALID_CODE_POINT,

    /* IPv4. */
    LXB_URL_ERROR_TYPE_IPV4_EMPTY_PART,
    LXB_URL_ERROR_TYPE_IPV4_TOO_MANY_PARTS,
    LXB_URL_ERROR_TYPE_IPV4_NON_NUMERIC_PART,
    LXB_URL_ERROR_TYPE_IPV4_NON_DECIMAL_PART,
    LXB_URL_ERROR_TYPE_IPV4_OUT_OF_RANGE_PART,

    /* IPv6. */
    LXB_URL_ERROR_TYPE_IPV6_UNCLOSED,
    LXB_URL_ERROR_TYPE_IPV6_INVALID_COMPRESSION,
    LXB_URL_ERROR_TYPE_IPV6_TOO_MANY_PIECES,
    LXB_URL_ERROR_TYPE_IPV6_MULTIPLE_COMPRESSION,
    LXB_URL_ERROR_TYPE_IPV6_INVALID_CODE_POINT,
    LXB_URL_ERROR_TYPE_IPV6_TOO_FEW_PIECES,

    /* IPv4 inside IPv6. */
    LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_TOO_MANY_PIECES,
    LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_INVALID_CODE_POINT,
    LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_OUT_OF_RANGE_PART,
    LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_TOO_FEW_PARTS,

    /* General URL syntax. */
    LXB_URL_ERROR_TYPE_INVALID_URL_UNIT,
    LXB_URL_ERROR_TYPE_SPECIAL_SCHEME_MISSING_FOLLOWING_SOLIDUS,
    LXB_URL_ERROR_TYPE_MISSING_SCHEME_NON_RELATIVE_URL,
    LXB_URL_ERROR_TYPE_INVALID_REVERSE_SOLIDUS,

    /* Credentials, host and port. */
    LXB_URL_ERROR_TYPE_INVALID_CREDENTIALS,
    LXB_URL_ERROR_TYPE_HOST_MISSING,
    LXB_URL_ERROR_TYPE_PORT_OUT_OF_RANGE,
    LXB_URL_ERROR_TYPE_PORT_INVALID,

    /* File scheme. */
    LXB_URL_ERROR_TYPE_FILE_INVALID_WINDOWS_DRIVE_LETTER,
    LXB_URL_ERROR_TYPE_FILE_INVALID_WINDOWS_DRIVE_LETTER_HOST,

    /* Trails the list: its value equals the number of entries above. */
    LXB_URL_ERROR_TYPE__LAST_ENTRY
}
lxb_url_error_type_t;
|||
|
|||
/*
 * States of the URL parser.
 *
 * LXB_URL_STATE__UNDEF is the value to pass as the override state of
 * lxb_url_parse_basic() when the default behaviour is wanted.
 *
 * Values are assigned implicitly starting from 0x00, so the order of the
 * entries is significant.
 */
typedef enum {
    LXB_URL_STATE__UNDEF = 0x00,

    /* Scheme. */
    LXB_URL_STATE_SCHEME_START_STATE,
    LXB_URL_STATE_SCHEME_STATE,
    LXB_URL_STATE_NO_SCHEME_STATE,

    /* Relative URL and authority detection. */
    LXB_URL_STATE_SPECIAL_RELATIVE_OR_AUTHORITY_STATE,
    LXB_URL_STATE_PATH_OR_AUTHORITY_STATE,
    LXB_URL_STATE_RELATIVE_STATE,
    LXB_URL_STATE_RELATIVE_SLASH_STATE,
    LXB_URL_STATE_SPECIAL_AUTHORITY_SLASHES_STATE,
    LXB_URL_STATE_SPECIAL_AUTHORITY_IGNORE_SLASHES_STATE,
    LXB_URL_STATE_AUTHORITY_STATE,

    /* Host and port. */
    LXB_URL_STATE_HOST_STATE,
    LXB_URL_STATE_HOSTNAME_STATE,
    LXB_URL_STATE_PORT_STATE,

    /* File scheme. */
    LXB_URL_STATE_FILE_STATE,
    LXB_URL_STATE_FILE_SLASH_STATE,
    LXB_URL_STATE_FILE_HOST_STATE,

    /* Path, query and fragment. */
    LXB_URL_STATE_PATH_START_STATE,
    LXB_URL_STATE_PATH_STATE,
    LXB_URL_STATE_OPAQUE_PATH_STATE,
    LXB_URL_STATE_QUERY_STATE,
    LXB_URL_STATE_FRAGMENT_STATE
}
lxb_url_state_t;
|||
|
|||
/*
 * Scheme identifiers.
 *
 * New values can only be appended at the bottom of the list, right before
 * LXB_URL_SCHEMEL_TYPE__LAST_ENTRY; existing values must not change.
 *
 * Please, see lxb_url_scheme_res in /lexbor/url/url.c.
 */
typedef enum {
    /* Service values. */
    LXB_URL_SCHEMEL_TYPE__UNDEF   = 0x00,
    LXB_URL_SCHEMEL_TYPE__UNKNOWN = 0x01,

    /* Named schemes. */
    LXB_URL_SCHEMEL_TYPE_HTTP     = 0x02,
    LXB_URL_SCHEMEL_TYPE_HTTPS    = 0x03,
    LXB_URL_SCHEMEL_TYPE_WS       = 0x04,
    LXB_URL_SCHEMEL_TYPE_WSS      = 0x05,
    LXB_URL_SCHEMEL_TYPE_FTP      = 0x06,
    LXB_URL_SCHEMEL_TYPE_FILE     = 0x07,

    /* Trails the list: one greater than the last assigned value. */
    LXB_URL_SCHEMEL_TYPE__LAST_ENTRY
}
lxb_url_scheme_type_t;
|||
|
|||
typedef struct { |
|||
const lexbor_str_t name; |
|||
uint16_t port; |
|||
lxb_url_scheme_type_t type; |
|||
} |
|||
lxb_url_scheme_data_t; |
|||
|
|||
typedef struct { |
|||
lexbor_str_t name; |
|||
lxb_url_scheme_type_t type; |
|||
} |
|||
lxb_url_scheme_t; |
|||
|
|||
/*
 * Kind of host stored in lxb_url_host_t.
 *
 * LXB_URL_HOST_TYPE__UNDEF has the value 0x00.
 */
typedef enum {
    LXB_URL_HOST_TYPE__UNDEF = 0x00,

    LXB_URL_HOST_TYPE_DOMAIN = 0x01,
    LXB_URL_HOST_TYPE_OPAQUE = 0x02,
    LXB_URL_HOST_TYPE_IPV4   = 0x03,
    LXB_URL_HOST_TYPE_IPV6   = 0x04,
    LXB_URL_HOST_TYPE_EMPTY  = 0x05
}
lxb_url_host_type_t;
|||
|
|||
typedef struct { |
|||
lxb_url_host_type_t type; |
|||
|
|||
union { |
|||
uint16_t ipv6[8]; |
|||
uint32_t ipv4; |
|||
lexbor_str_t opaque; |
|||
lexbor_str_t domain; |
|||
} u; |
|||
} |
|||
lxb_url_host_t; |
|||
|
|||
typedef struct { |
|||
lexbor_str_t str; |
|||
size_t length; |
|||
bool opaque; |
|||
} |
|||
lxb_url_path_t; |
|||
|
|||
typedef struct { |
|||
lxb_url_scheme_t scheme; |
|||
|
|||
lxb_url_host_t host; |
|||
|
|||
lexbor_str_t username; |
|||
lexbor_str_t password; |
|||
|
|||
uint16_t port; |
|||
bool has_port; |
|||
|
|||
lxb_url_path_t path; |
|||
|
|||
lexbor_str_t query; |
|||
lexbor_str_t fragment; |
|||
|
|||
lexbor_mraw_t *mraw; |
|||
} |
|||
lxb_url_t; |
|||
|
|||
typedef struct { |
|||
lxb_url_t *url; |
|||
lexbor_mraw_t *mraw; |
|||
lexbor_plog_t *log; |
|||
|
|||
lxb_unicode_idna_t *idna; |
|||
} |
|||
lxb_url_parser_t; |
|||
|
|||
|
|||
/* |
|||
* Create lxb_url_parser_t object. |
|||
* |
|||
* @return lxb_url_parser_t * if successful, otherwise NULL. |
|||
*/ |
|||
LXB_API lxb_url_parser_t * |
|||
lxb_url_parser_create(void); |
|||
|
|||
/* |
|||
* Initialization of lxb_url_parser_t object. |
|||
* |
|||
* The parser is not bound to the received URLs in any way. That is, after |
|||
* parsing the lxb_url_parser_t object can be destroyed and we can continue |
|||
* working with the received URLs. |
|||
* |
|||
* Memory for created URLs is taken from lexbor_mraw_t object, which you can |
|||
* pass during initialization of lxb_url_parser_t object, or a new lexbor_mraw_t |
|||
* object will be created during initialization if NULL is passed. |
|||
* |
|||
* Each created URL will have a pointer to the lexbor_mraw_t object. |
|||
* |
|||
* By destroying the lexbor_mraw_t object you destroy all the URL objects |
|||
* created by the parser. Use the lxb_url_destroy() function to destroy a |
|||
* specific URL. |
|||
* |
|||
* Destroying the lxb_url_parser_t object with lxb_url_parser_destroy() does |
|||
* not destroy the lexbor_mraw_t memory object. |
|||
* |
|||
* Please, see functions lxb_url_parser_memory_destroy(), lxb_url_destroy(), |
|||
* lxb_url_memory_destroy(). |
|||
* |
|||
* @param[in] lxb_url_parser_t * |
|||
* @param[in] lexbor_mraw_t *. Can be NULL. If NULL is passed, it will create its own |
|||
* memory object inside parser and it will be bound to all created URLs. |
|||
* |
|||
* @return LXB_STATUS_OK if successful, otherwise an error status value. |
|||
*/ |
|||
LXB_API lxb_status_t |
|||
lxb_url_parser_init(lxb_url_parser_t *parser, lexbor_mraw_t *mraw); |
|||
|
|||
/* |
|||
* Clears the object. Returns the object to its state as after initialization. |
|||
* |
|||
* This function must be called before the parsing functions can be reused. |
|||
* |
|||
* For example: |
|||
* lxb_url_parse() |
|||
* lxb_url_parser_clean() |
|||
* lxb_url_parse() |
|||
* lxb_url_destroy() |
|||
* |
|||
* @param[in] lxb_url_parser_t * |
|||
*/ |
|||
LXB_API void |
|||
lxb_url_parser_clean(lxb_url_parser_t *parser); |
|||
|
|||
/* |
|||
* Destroy lxb_url_parser_t object. |
|||
* |
|||
* Release of occupied resources. |
|||
* The lexbor_mraw_t memory object is not destroyed in this function. |
|||
* |
|||
* @param[in] lxb_url_parser_t *. Can be NULL. |
|||
* @param[in] if false: only destroys internal buffers. |
|||
* if true: destroys the lxb_url_parser_t object and all internal buffers. |
|||
* |
|||
* @return lxb_url_parser_t * if destroy_self = false, otherwise NULL. |
|||
*/ |
|||
LXB_API lxb_url_parser_t * |
|||
lxb_url_parser_destroy(lxb_url_parser_t *parser, bool destroy_self); |
|||
|
|||
/* |
|||
* Destroys the lexbor_mraw_t object, and thus all associated URLs. |
|||
* |
|||
* After that, new URLs cannot be parsed until a new lexbor_mraw_t object is |
|||
* assigned to the lxb_url_parser_t object. |
|||
* |
|||
* @param[in] lxb_url_parser_t *. |
|||
*/ |
|||
LXB_API void |
|||
lxb_url_parser_memory_destroy(lxb_url_parser_t *parser); |
|||
|
|||
/* |
|||
* URL parser. |
|||
* |
|||
* This function is an implementation of URL parsing according to the WHATWG |
|||
* specification. |
|||
* |
|||
* @param[in] lxb_url_parser_t *. |
|||
* @param[in] const lxb_url_t *. Base URL, can be NULL. |
|||
* @param[in] Input characters. Not NULL. |
|||
* @param[in] Length of characters. Can be 0. |
|||
* |
|||
* @return lxb_url_t * if successful, otherwise NULL. |
|||
*/ |
|||
LXB_API lxb_url_t * |
|||
lxb_url_parse(lxb_url_parser_t *parser, const lxb_url_t *base_url, |
|||
const lxb_char_t *data, size_t length); |
|||
|
|||
/* |
|||
* URL basic parser. |
|||
* |
|||
* This function is an implementation of URL parsing according to the WHATWG |
|||
* specification. |
|||
* |
|||
* Use the lxb_url_get() function to get the URL object. |
|||
* |
|||
* @param[in] lxb_url_parser_t *. |
|||
* @param[in] lxb_url_t *. Can be NULL. |
|||
* @param[in] const lxb_url_t *. Base URL, can be NULL. |
|||
* @param[in] Input characters. Not NULL. |
|||
* @param[in] Length of characters. Can be 0. |
|||
* @param[in] lxb_url_state_t, for default set to LXB_URL_STATE__UNDEF. |
|||
* @param[in] lxb_encoding_t, default (LXB_ENCODING_DEFAULT) LXB_ENCODING_UTF_8. |
|||
* |
|||
* @return LXB_STATUS_OK if successful, otherwise an error status value. |
|||
*/ |
|||
LXB_API lxb_status_t |
|||
lxb_url_parse_basic(lxb_url_parser_t *parser, lxb_url_t *url, |
|||
const lxb_url_t *base_url, |
|||
const lxb_char_t *data, size_t length, |
|||
lxb_url_state_t override_state, lxb_encoding_t encoding); |
|||
|
|||
/* |
|||
* Erase URL. |
|||
* |
|||
* Frees all internal memory occupied by the URL object, but does not destroy |
|||
* the object. |
|||
* |
|||
* @param[in] lxb_url_t *. |
|||
* |
|||
* The function does not return a value. |
|||
*/ |
|||
LXB_API void |
|||
lxb_url_erase(lxb_url_t *url); |
|||
|
|||
/* |
|||
* Destroys URL. |
|||
* |
|||
* @param[in] lxb_url_t *. |
|||
* |
|||
* @return NULL. |
|||
*/ |
|||
LXB_API lxb_url_t * |
|||
lxb_url_destroy(lxb_url_t *url); |
|||
|
|||
/* |
|||
* Destroys the lexbor_mraw_t memory object. |
|||
* |
|||
* The function will destroy all URLs associated with the lexbor_mraw_t memory |
|||
* object, including the passed one. |
|||
* |
|||
* Keep in mind, if you have a live lxb_url_parser_t parsing object, you will |
|||
* have a pointer to garbage after calling this function instead of a pointer |
|||
* to the lexbor_mraw_t object. |
|||
* In this case you need to assign a new memory object lexbor_mraw_t for the |
|||
* parser. Use the lxb_url_mraw_set() function. |
|||
* |
|||
* @param[in] lxb_url_t *. |
|||
*/ |
|||
LXB_API void |
|||
lxb_url_memory_destroy(lxb_url_t *url); |
|||
|
|||
|
|||
/* |
|||
* Below is an API for modifying the URL object according to the |
|||
* https://url.spec.whatwg.org/#api specification. |
|||
* |
|||
* It is not necessary to pass the lxb_url_parser_t object to API functions. |
|||
* You need to pass the parser if you want to have logs of parsing. |
|||
* |
|||
* All API functions can be passed NULL as "const lxb_char_t *" data. |
|||
*/ |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_api_href_set(lxb_url_t *url, lxb_url_parser_t *parser, |
|||
const lxb_char_t *href, size_t length); |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_api_protocol_set(lxb_url_t *url, lxb_url_parser_t *parser, |
|||
const lxb_char_t *protocol, size_t length); |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_api_username_set(lxb_url_t *url, |
|||
const lxb_char_t *username, size_t length); |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_api_password_set(lxb_url_t *url, |
|||
const lxb_char_t *password, size_t length); |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_api_host_set(lxb_url_t *url, lxb_url_parser_t *parser, |
|||
const lxb_char_t *host, size_t length); |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_api_hostname_set(lxb_url_t *url, lxb_url_parser_t *parser, |
|||
const lxb_char_t *hostname, size_t length); |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_api_port_set(lxb_url_t *url, lxb_url_parser_t *parser, |
|||
const lxb_char_t *port, size_t length); |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_api_pathname_set(lxb_url_t *url, lxb_url_parser_t *parser, |
|||
const lxb_char_t *pathname, size_t length); |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_api_search_set(lxb_url_t *url, lxb_url_parser_t *parser, |
|||
const lxb_char_t *search, size_t length); |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_api_hash_set(lxb_url_t *url, lxb_url_parser_t *parser, |
|||
const lxb_char_t *hash, size_t length); |
|||
|
|||
|
|||
/* |
|||
* Below are functions for serializing a URL object and its individual |
|||
* parameters. |
|||
* |
|||
* Note that the callback may be called more than once. |
|||
* For example, the lxb_url_serialize() function will callback multiple times: |
|||
* 1. http |
|||
* 2. :// |
|||
* 3. example.com |
|||
* and so on. |
|||
*/ |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_serialize(const lxb_url_t *url, lexbor_serialize_cb_f cb, void *ctx, |
|||
bool exclude_fragment); |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_serialize_scheme(const lxb_url_t *url, |
|||
lexbor_serialize_cb_f cb, void *ctx); |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_serialize_username(const lxb_url_t *url, |
|||
lexbor_serialize_cb_f cb, void *ctx); |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_serialize_password(const lxb_url_t *url, |
|||
lexbor_serialize_cb_f cb, void *ctx); |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_serialize_host(const lxb_url_host_t *host, |
|||
lexbor_serialize_cb_f cb, void *ctx); |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_serialize_host_unicode(lxb_unicode_idna_t *idna, |
|||
const lxb_url_host_t *host, |
|||
lexbor_serialize_cb_f cb, void *ctx); |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_serialize_host_ipv4(uint32_t ipv4, |
|||
lexbor_serialize_cb_f cb, void *ctx); |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_serialize_host_ipv6(const uint16_t *ipv6, |
|||
lexbor_serialize_cb_f cb, void *ctx); |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_serialize_port(const lxb_url_t *url, |
|||
lexbor_serialize_cb_f cb, void *ctx); |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_serialize_path(const lxb_url_path_t *path, |
|||
lexbor_serialize_cb_f cb, void *ctx); |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_serialize_query(const lxb_url_t *url, |
|||
lexbor_serialize_cb_f cb, void *ctx); |
|||
|
|||
LXB_API lxb_status_t |
|||
lxb_url_serialize_fragment(const lxb_url_t *url, |
|||
lexbor_serialize_cb_f cb, void *ctx); |
|||
|
|||
/* |
|||
* Creates a clone of the object's URL. |
|||
* |
|||
* For lexbor_mraw_t *, use url->mraw or another lexbor_mraw_t * object. |
|||
* |
|||
* @param[in] lexbor_mraw_t *. |
|||
* @param[in] lxb_url_t *. |
|||
* |
|||
* @return a new URL object if successful, otherwise NULL value. |
|||
*/ |
|||
LXB_API lxb_url_t * |
|||
lxb_url_clone(lexbor_mraw_t *mraw, lxb_url_t *url); |
|||
|
|||
/* |
|||
* Inline functions. |
|||
*/ |
|||
|
|||
lxb_inline const lexbor_str_t * |
|||
lxb_url_scheme(const lxb_url_t *url) |
|||
{ |
|||
return &url->scheme.name; |
|||
} |
|||
|
|||
lxb_inline const lexbor_str_t * |
|||
lxb_url_username(const lxb_url_t *url) |
|||
{ |
|||
return &url->username; |
|||
} |
|||
|
|||
lxb_inline const lexbor_str_t * |
|||
lxb_url_password(const lxb_url_t *url) |
|||
{ |
|||
return &url->password; |
|||
} |
|||
|
|||
lxb_inline const lxb_url_host_t * |
|||
lxb_url_host(const lxb_url_t *url) |
|||
{ |
|||
return &url->host; |
|||
} |
|||
|
|||
lxb_inline uint16_t |
|||
lxb_url_port(const lxb_url_t *url) |
|||
{ |
|||
return url->port; |
|||
} |
|||
|
|||
lxb_inline bool |
|||
lxb_url_has_port(const lxb_url_t *url) |
|||
{ |
|||
return url->has_port; |
|||
} |
|||
|
|||
lxb_inline const lxb_url_path_t * |
|||
lxb_url_path(const lxb_url_t *url) |
|||
{ |
|||
return &url->path; |
|||
} |
|||
|
|||
lxb_inline const lexbor_str_t * |
|||
lxb_url_path_str(const lxb_url_t *url) |
|||
{ |
|||
return &url->path.str; |
|||
} |
|||
|
|||
lxb_inline const lexbor_str_t * |
|||
lxb_url_query(const lxb_url_t *url) |
|||
{ |
|||
return &url->query; |
|||
} |
|||
|
|||
lxb_inline const lexbor_str_t * |
|||
lxb_url_fragment(const lxb_url_t *url) |
|||
{ |
|||
return &url->fragment; |
|||
} |
|||
|
|||
/*
 * Returns the memory object currently assigned to the parser.
 *
 * @param[in] lxb_url_parser_t *. Not NULL.
 *
 * @return lexbor_mraw_t * stored in the parser.
 */
lxb_inline lexbor_mraw_t *
lxb_url_mraw(lxb_url_parser_t *parser)
{
    lexbor_mraw_t *mraw = parser->mraw;

    return mraw;
}
|||
|
|||
lxb_inline void |
|||
lxb_url_mraw_set(lxb_url_parser_t *parser, lexbor_mraw_t *mraw) |
|||
{ |
|||
parser->mraw = mraw; |
|||
} |
|||
|
|||
/*
 * Returns the URL object stored in the parser.
 *
 * Use it to get the URL after calling lxb_url_parse_basic().
 *
 * @param[in] lxb_url_parser_t *. Not NULL.
 *
 * @return lxb_url_t * stored in the parser.
 */
lxb_inline lxb_url_t *
lxb_url_get(lxb_url_parser_t *parser)
{
    lxb_url_t *url = parser->url;

    return url;
}
|||
|
|||
|
|||
#ifdef __cplusplus |
|||
} /* extern "C" */ |
|||
#endif |
|||
|
|||
#endif /* LEXBOR_URL_H */ |
Write
Preview
Loading…
Cancel
Save
Reference in new issue