Browse Source

[Fix] Fix some corner cases of single-host urls parsing

pull/4572/head
Vsevolod Stakhov 2 years ago
parent
commit
d029c2a400
No known key found for this signature in database GPG Key ID: 7647B6790081437
  1. 87
      src/libserver/url.c

87
src/libserver/url.c

@ -1,11 +1,11 @@
/*-
* Copyright 2016 Vsevolod Stakhov
/*
* Copyright 2023 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@ -14,31 +14,6 @@
* limitations under the License.
*/
/*
* Copyright (C) 2002-2015 Igor Sysoev
* Copyright (C) 2011-2015 Nginx, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "url.h"
#include "util.h"
@ -193,8 +168,9 @@ struct url_matcher static_matchers[] = {
{"ftp.", "ftp://", url_web_start, url_web_end,
0},
/* Likely emails */
{"@", "mailto://", url_email_start, url_email_end,
0}};
{
"@", "mailto://", url_email_start, url_email_end,
0}};
struct rspamd_url_flag_name {
const gchar *name;
@ -1817,7 +1793,7 @@ rspamd_url_regen_from_inet_addr(struct rspamd_url *uri, const void *addr, int af
}
static gboolean
rspamd_url_is_ip(struct rspamd_url *uri, rspamd_mempool_t *pool)
rspamd_url_maybe_regenerate_from_ip(struct rspamd_url *uri, rspamd_mempool_t *pool)
{
const gchar *p, *end, *c;
gchar *errstr;
@ -2214,7 +2190,7 @@ rspamd_url_parse(struct rspamd_url *uri,
struct http_parser_url u;
gchar *p;
const gchar *end;
guint i, complen, ret, flags = 0;
guint complen, ret, flags = 0;
gsize unquoted_len = 0;
memset(uri, 0, sizeof(*uri));
@ -2277,7 +2253,7 @@ rspamd_url_parse(struct rspamd_url *uri,
p + u.field_data[UF_SCHEMA].len + 1,
len - 2 - u.field_data[UF_SCHEMA].len);
/* Compensate slashes added */
for (i = UF_SCHEMA + 1; i < UF_MAX; i++) {
for (int i = UF_SCHEMA + 1; i < UF_MAX; i++) {
if (u.field_set & (1 << i)) {
u.field_data[i].off += 2;
}
@ -2291,7 +2267,7 @@ rspamd_url_parse(struct rspamd_url *uri,
uri->urllen = len;
uri->flags = flags;
for (i = 0; i < UF_MAX; i++) {
for (guint i = 0; i < UF_MAX; i++) {
if (u.field_set & (1 << i)) {
guint shift = u.field_data[i].off;
complen = u.field_data[i].len;
@ -2458,7 +2434,7 @@ rspamd_url_parse(struct rspamd_url *uri,
rspamd_url_shift(uri, unquoted_len, UF_HOST);
if (uri->protocol == PROTOCOL_UNKNOWN) {
for (i = 0; i < G_N_ELEMENTS(rspamd_url_protocols); i++) {
for (int i = 0; i < G_N_ELEMENTS(rspamd_url_protocols); i++) {
if (uri->protocollen == rspamd_url_protocols[i].len) {
if (memcmp(uri->string,
rspamd_url_protocols[i].name, uri->protocollen) == 0) {
@ -2481,21 +2457,50 @@ rspamd_url_parse(struct rspamd_url *uri,
/*
* If we have not detected eSLD, but there are no dots in the hostname,
* then we should treat the whole hostname as eSLD - a rule of thumb
*
* We also check that a hostname ends with a permitted character, and all characters are forming
* DNS label. We also need to check for a numeric IP within this check.
*/
if (uri->hostlen > 0 && memchr(rspamd_url_host_unsafe(uri), '.', uri->hostlen) == NULL) {
uri->tldlen = uri->hostlen;
uri->tldshift = uri->hostshift;
const char *dot_pos = memchr(rspamd_url_host_unsafe(uri), '.', uri->hostlen);
bool is_whole_hostname_tld = false;
if (uri->hostlen > 0 && (dot_pos == NULL || dot_pos == rspamd_url_host_unsafe(uri) + uri->hostlen - 1)) {
bool all_chars_domain = true;
for (int i = 0; i < uri->hostlen; i++) {
if (!is_domain(rspamd_url_host_unsafe(uri)[i])) {
all_chars_domain = false;
break;
}
}
if (all_chars_domain) {
/* Also check the last character to be either a dot or alphanumeric character */
char last_c = rspamd_url_host_unsafe(uri)[uri->hostlen - 1];
if (last_c != '.' && !g_ascii_isalnum(last_c)) {
all_chars_domain = false;
}
}
if (all_chars_domain) {
/* Additionally check for a numeric IP as we can have some number here... */
rspamd_url_maybe_regenerate_from_ip(uri, pool);
uri->tldlen = uri->hostlen;
uri->tldshift = uri->hostshift;
is_whole_hostname_tld = true;
}
}
else {
if (!is_whole_hostname_tld) {
if (uri->protocol != PROTOCOL_MAILTO) {
if (url_scanner->has_tld_file && !(parse_flags & RSPAMD_URL_PARSE_HREF)) {
/* Ignore URL's without TLD if it is not a numeric URL */
if (!rspamd_url_is_ip(uri, pool)) {
if (!rspamd_url_maybe_regenerate_from_ip(uri, pool)) {
return URI_ERRNO_TLD_MISSING;
}
}
else {
if (!rspamd_url_is_ip(uri, pool)) {
if (!rspamd_url_maybe_regenerate_from_ip(uri, pool)) {
/* Assume tld equal to host */
uri->tldshift = uri->hostshift;
uri->tldlen = uri->hostlen;

Loading…
Cancel
Save