A few minor Unicode collation customization improvements were made,

which makes it possible to add more world language collations with very complex collation rules (e.g. Myanmar):
- The weight string for a single character in a user-defined collation was erroneously limited to 7 weights (instead of 8 weights). Added an extra element to the user-defined weight arrays, to fit 8 non-zero weights.
- The weight string limit for contractions was made twice as long (16 weights), which allows longer contractions without affecting the performance of filesort.
- A user-defined collation now refuses to initialize and reports an error if a weight string gets longer than 8 weights for a single character, or longer than 16 weights for a contraction. Previously, weight strings for such characters (and contractions) were silently truncated, so a collation could start with wrong rules without any error.
- Fixed a bug in handling rules like "&a << b" in combination with shift-after-method="expand". The primary weight for "b" was not calculated correctly, which erroneously made "b" primary greater than "a" instead of primary equal to "a".
12 years ago · bd3dc54261
5 changed files with 136 additions and 48 deletions
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@ -88,13 +88,25 @@ extern MY_UNICASE_INFO my_unicase_mysql500;
 extern MY_UNICASE_INFO my_unicase_unicode520;

 #define MY_UCA_MAX_CONTRACTION 6
-#define MY_UCA_MAX_WEIGHT_SIZE 8
+/*
+  The DUCET tables in ctype-uca.c are dumped with a limit of 8 weights
+  per character. cs->strxfrm_multiply is set to 8 for all UCA based collations.
+
+  In language-specific UCA collations (with tailorings) we also do not allow
+  a single character to have more than 8 weights, to stay within the same
+  strxfrm_multiply limit. Note that contractions are allowed to have weight
+  strings twice as long (up to 16 weights). As a contraction consists of at
+  least 2 characters, this ensures that the strxfrm_multiply ratio of 8
+  is respected.
+*/
+#define MY_UCA_MAX_WEIGHT_SIZE (8+1)               /* Including 0 terminator */
+#define MY_UCA_CONTRACTION_MAX_WEIGHT_SIZE (2*8+1) /* Including 0 terminator */
 #define MY_UCA_WEIGHT_LEVELS   1

 typedef struct my_contraction_t
 {
  my_wc_t ch[MY_UCA_MAX_CONTRACTION];   /* Character sequence              */
-  uint16 weight[MY_UCA_MAX_WEIGHT_SIZE];/* Its weight string, 0-terminated */
+  uint16 weight[MY_UCA_CONTRACTION_MAX_WEIGHT_SIZE];/* Its weight string, 0-terminated */
  my_bool with_context;
 } MY_CONTRACTION;

--- a/mysql-test/r/ctype_ldml.result
+++ b/mysql-test/r/ctype_ldml.result
@ -425,6 +425,7 @@ ucs2_test_ci	ucs2	358			8
 ucs2_vn_ci	ucs2	359			8
 ucs2_5624_1	ucs2	360			8
 utf8_5624_5	utf8	368			8
+utf8_5624_5_bad	utf8	369			8
 utf32_test_ci	utf32	391			8
 utf8_maxuserid_ci	utf8	2047			8
 show collation like '%test%';
@ -1030,9 +1031,12 @@ INSERT INTO t1 VALUES ('A'),('B'),('C'),('D'),('E'),('F'),('G'),('H'),('I');
 INSERT INTO t1 VALUES ('J'),('K'),('L'),('M'),('N'),('O'),('P'),('Q'),('R');
 INSERT INTO t1 VALUES ('S'),('T'),('U'),('V'),('W'),('X'),('Y'),('Z');
 INSERT INTO t1 VALUES ('AA'),('AAA');
+INSERT INTO t1 VALUES ('001'),('002');
 SELECT a, HEX(WEIGHT_STRING(a)) FROM t1 ORDER BY a, LENGTH(a), BINARY(a);
 a	HEX(WEIGHT_STRING(a))
 0	0E29
+001	0E29
+002	0E29
 0z	0E290E292357
 0ﾝ	0E291E81
 a	0E29233E
@ -1093,6 +1097,12 @@ AA	0E293358
 AAA	0E293359
 1	0E2A
 DROP TABLE t1;
+SET NAMES utf8 COLLATE utf8_5624_5_bad;
+ERROR HY000: Unknown collation: 'utf8_5624_5_bad'
+SHOW WARNINGS;
+Level	Code	Message
+Error	1273	Unknown collation: 'utf8_5624_5_bad'
+Warning	1273	Expansion too long: 'a\u002Daaaaaa10'
 #
 # End of WL#5624
 #
--- a/mysql-test/std_data/Index.xml
+++ b/mysql-test/std_data/Index.xml
@ -114,13 +114,25 @@
           weight space between 0 and 1 in DUCET.
           Also, to test it works with contractions, put some after 'z'.
        -->
-        <reset>0</reset>
+        <reset>0</reset><s>001</s><s>002</s>
        <pc>abcdefghijklmnopqrstuvwxyz</pc><p>aa</p><p>aaa</p>
        <reset before="primary">1</reset>
        <pc>ABCDEFGHIJKLMNOPQRSTUVWXYZ</pc><p>AA</p><p>AAA</p>
      </rules>
    </collation>

+    <collation name="utf8_5624_5_bad" id="369" shift-after-method="expand">
+      <rules>
+      <reset>a-a4</reset><p>xxx04</a>
+      <reset>a-aa5</reset><p>xxx05</a>
+      <reset>a-aaa6</reset><p>xxx06</a>
+      <reset>a-aaaa7</reset><p>xxx07</a>
+      <reset>a-aaaaa8</reset><p>xxx08</a>
+      <reset>a-aaaaaa9</reset><p>xxx09</a>
+      <reset>a-aaaaaa10</reset><p>xxx10</a>
+      </rules>
+    </collation>
+
   <collation name="utf8_hugeid_ci" id="2047000000">
      <rules>
        <reset>a</reset>
--- a/mysql-test/t/ctype_ldml.test
+++ b/mysql-test/t/ctype_ldml.test
@ -342,10 +342,14 @@ INSERT INTO t1 VALUES ('A'),('B'),('C'),('D'),('E'),('F'),('G'),('H'),('I');
 INSERT INTO t1 VALUES ('J'),('K'),('L'),('M'),('N'),('O'),('P'),('Q'),('R');
 INSERT INTO t1 VALUES ('S'),('T'),('U'),('V'),('W'),('X'),('Y'),('Z');
 INSERT INTO t1 VALUES ('AA'),('AAA');
+INSERT INTO t1 VALUES ('001'),('002');

 SELECT a, HEX(WEIGHT_STRING(a)) FROM t1 ORDER BY a, LENGTH(a), BINARY(a);
 DROP TABLE t1;

+--error ER_UNKNOWN_COLLATION
+SET NAMES utf8 COLLATE utf8_5624_5_bad;
+SHOW WARNINGS;

 --echo #
 --echo # End of WL#5624
--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
@ -8211,7 +8211,7 @@ ex:
  Collation rule item
 */

-#define MY_UCA_MAX_EXPANSION  6  /* Maximum expansion length   */
+#define MY_UCA_MAX_EXPANSION  10 /* Maximum expansion length   */

 typedef struct my_coll_rule_item_st
 {
@ -8821,42 +8821,6 @@ my_coll_parser_scan_reset_sequence(MY_COLL_RULE_PARSER *p)
                                            MY_UCA_MAX_EXPANSION, "Expansion"))
      return 0;
  }
-
-  if (p->rules->shift_after_method == my_shift_method_expand ||
-      p->rule.before_level == 1) /* Apply "before primary" option  */
-  {
-    /*
-      Suppose we have this rule:  &B[before primary] < C
-      i.e. we need to put C before B, but after A, so
-      the result order is: A < C < B.
-
-      Let primary weight of B be [BBBB].
-
-      We cannot just use [BBBB-1] as weight for C:
-      DUCET does not have enough unused weights between any two characters,
-      so using [BBBB-1] will likely make C equal to the previous character,
-      which is A, so we'll get this order instead of the desired: A = C < B.
-
-      To guarantee that that C is sorted after A, we'll use expansion
-      with a kind of "biggest possible character".
-      As "biggest possible character" we'll use "last_non_ignorable":
-
-      We'll compose weight for C as: [BBBB-1][MMMM+1]
-      where [MMMM] is weight for "last_non_ignorable".
-      
-      We also do the same trick for "reset after" if the collation
-      option says so. E.g. for the rules "&B < C", weight for
-      C will be calculated as: [BBBB][MMMM+1]
-
-      At this point we only need to store codepoints
-      'B' and 'last_non_ignorable'. Actual weights for 'C'
-      will be calculated according to the above formula later,
-      in create_tailoring().
-    */
-    if (!my_coll_rule_expand(p->rule.base, MY_UCA_MAX_EXPANSION,
-                             p->rules->uca->last_non_ignorable))
-      return my_coll_parser_too_long_error(p, "Expansion");
-  }
  return 1;
 }

@ -9056,20 +9020,25 @@ my_coll_rule_parse(MY_COLL_RULES *rules,
  @dst_uca    destination UCA weight data
  @to         destination address
  @to_length  size of destination
+  @nweights   OUT number of weights put to "to"
  @str        wide string
  @len        string length
  
-  @return    number of weights put
+  @return     FALSE on success, TRUE if the weights did not fit.
 */

-static size_t
+static my_bool
 my_char_weight_put(MY_UCA_WEIGHT_LEVEL *dst,
-                   uint16 *to, size_t to_length,
+                   uint16 *to, size_t to_length, size_t *nweights,
                   my_wc_t *str, size_t len)
 {
  size_t count;
+  int rc= FALSE;
  if (!to_length)
-    return 0;
+  {
+    *nweights= 0;
+    return len > 0;
+  }
  to_length--; /* Without trailing zero */

  for (count= 0; len; )
@ -9099,10 +9068,13 @@ my_char_weight_put(MY_UCA_WEIGHT_LEVEL *dst,
      *to++= *from++;
      count++;
    }
+    if (count == to_length && from && * from)
+      rc= TRUE; /* All weights did not fit */
  }

  *to= 0;
-  return count;
+  *nweights= count;
+  return rc;
 }


@ -9191,6 +9163,37 @@ apply_shift(MY_CHARSET_LOADER *loader,
 }


+static void
+wstr_to_str(char *str, size_t length, my_wc_t *wc, size_t wlength)
+{
+  const char *end= str + length;
+  char *s;
+  size_t i, rem;
+  for (s= str, i= 0; (rem= (end - s)) > 0 && i < wlength; i++)
+  {
+    if ((wc[i] >= '0' && wc[i] <= '9') ||
+        (wc[i] >= 'a' && wc[i] <= 'z') ||
+        (wc[i] >= 'A' && wc[i] <= 'Z'))
+      s+= my_snprintf(s, rem, "%c", (int) wc[i]);
+    else
+      s+= my_snprintf(s, rem, "\\u%04X", (int) wc[i]);
+  }
+}
+
+
+static void
+my_charset_loader_error_for_rule(MY_CHARSET_LOADER *loader, 
+                                 const MY_COLL_RULE *r,
+                                 const char *name,
+                                 my_wc_t *wc, size_t wlength)
+{
+  char tmp[128];
+  wstr_to_str(tmp, sizeof(tmp), wc, wlength);
+  my_snprintf(loader->error, sizeof(loader->error),
+              "%s too long: '%s'", name, tmp);
+}
+
+
 static my_bool
 apply_one_rule(MY_CHARSET_LOADER *loader,
               MY_COLL_RULES *rules, MY_COLL_RULE *r, int level,
@ -9200,6 +9203,47 @@ apply_one_rule(MY_CHARSET_LOADER *loader,
  size_t nreset= my_coll_rule_reset_length(r); /* Length of reset sequence */
  size_t nshift= my_coll_rule_shift_length(r); /* Length of shift sequence */
  uint16 *to;
+  my_bool rc;
+
+  if ((rules->shift_after_method == my_shift_method_expand && r->diff[0]) ||
+      r->before_level == 1)
+  {
+    /*
+      Suppose we have this rule:  &B[before primary] < C
+      i.e. we need to put C before B, but after A, so
+      the result order is: A < C < B.
+
+      Let primary weight of B be [BBBB].
+
+      We cannot just use [BBBB-1] as weight for C:
+      DUCET does not have enough unused weights between any two characters,
+      so using [BBBB-1] will likely make C equal to the previous character,
+      which is A, so we'll get this order instead of the desired: A = C < B.
+
+      To guarantee that C is sorted after A, we'll use expansion
+      with a kind of "biggest possible character".
+      As "biggest possible character" we'll use "last_non_ignorable":
+
+      We'll compose weight for C as: [BBBB-1][MMMM+1]
+      where [MMMM] is weight for "last_non_ignorable".
+      
+      We also do the same trick for "reset after" if the collation
+      option says so. E.g. for the rules "&B < C", weight for
+      C will be calculated as: [BBBB][MMMM+1]
+
+      At this point we only need to store codepoints
+      'B' and 'last_non_ignorable'. Actual weights for 'C'
+      will be calculated according to the above formula later,
+      in create_tailoring().
+    */
+    if (!my_coll_rule_expand(r->base, MY_UCA_MAX_EXPANSION,
+                             rules->uca->last_non_ignorable))
+    {
+      my_charset_loader_error_for_rule(loader, r, "Expansion", r->base, nreset);
+      return TRUE;
+    }
+    nreset= my_coll_rule_reset_length(r);
+  }

  if (nshift >= 2) /* Contraction */
  {
@ -9222,7 +9266,8 @@ apply_one_rule(MY_CHARSET_LOADER *loader,
                               r->with_context)->weight;
    /* Store weights of the "reset to" character */
    dst->contractions.nitems--; /* Temporarily hide - it's incomplete */
-    nweights= my_char_weight_put(dst, to, MY_UCA_MAX_WEIGHT_SIZE,
+    rc= my_char_weight_put(dst,
+                           to, MY_UCA_CONTRACTION_MAX_WEIGHT_SIZE, &nweights,
                           r->base, nreset);
    dst->contractions.nitems++; /* Activate, now it's complete */
  }
@ -9232,7 +9277,12 @@ apply_one_rule(MY_CHARSET_LOADER *loader,
    DBUG_ASSERT(dst->weights[pagec]);
    to= my_char_weight_addr(dst, r->curr[0]);
    /* Store weights of the "reset to" character */
-    nweights= my_char_weight_put(dst, to, dst->lengths[pagec], r->base, nreset);
+    rc= my_char_weight_put(dst, to, dst->lengths[pagec], &nweights, r->base, nreset);
+  }
+  if (rc)
+  {
+    my_charset_loader_error_for_rule(loader, r, "Expansion", r->base, nreset);
+    return rc;
  }

  /* Apply level difference. */