mirror of https://github.com/MariaDB/server
				
				
			
			You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							427 lines
						
					
					
						
							11 KiB
						
					
					
				
			
		
		
		
			
			
			
		
		
	
	
							427 lines
						
					
					
						
							11 KiB
						
					
					
				| /* Copyright (C) 2000 MySQL AB | |
|  | |
|    This program is free software; you can redistribute it and/or modify | |
|    it under the terms of the GNU General Public License as published by | |
|    the Free Software Foundation; version 2 of the License. | |
|  | |
|    This program is distributed in the hope that it will be useful, | |
|    but WITHOUT ANY WARRANTY; without even the implied warranty of | |
|    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | |
|    GNU General Public License for more details. | |
|  | |
|    You should have received a copy of the GNU General Public License | |
|    along with this program; if not, write to the Free Software | |
|    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */ | |
| 
 | |
| #include <my_global.h> | |
| #include <m_ctype.h> | |
| #include <my_xml.h> | |
| #ifndef SCO | |
| #include <m_string.h> | |
| #endif | |
|  | |
| 
 | |
| /* | |
|  | |
|   This file implements routines which parse XML-based | |
|   character set and collation description files. | |
|    | |
|   Unicode collations are encoded according to | |
|    | |
|     Unicode Technical Standard #35 | |
|     Locale Data Markup Language (LDML) | |
|     http://www.unicode.org/reports/tr35/ | |
|    | |
|   and converted into ICU string according to | |
|    | |
|     Collation Customization | |
|     http://oss.software.ibm.com/icu/userguide/Collate_Customization.html | |
|    | |
| */ | |
| 
 | |
/*
  Copy a bounded prefix of src into str and NUL-terminate it.

  The number of bytes copied is the smaller of l1 and l2; the
  terminating '\0' is stored right after them, so str must have
  room for min(l1, l2) + 1 bytes.

  Returns str.
*/
static char *mstr(char *str,const char *src,size_t l1,size_t l2)
{
  size_t ncopy= l2;
  if (l1 < l2)
    ncopy= l1;
  memcpy(str, src, ncopy);
  *(str + ncopy)= '\0';
  return str;
}
| 
 | |
/*
  One row of the XML path lookup table: associates the full path of
  an element or attribute with one of the _CS_* state codes.
*/
struct my_cs_file_section_st
{
  int        state;  /* _CS_* code assigned to the path            */
  const char *str;   /* path text; NULL in the row ending the table */
};
| 
 | |
/*
  State codes attached to the XML paths known to the parser.
  _CS_MISC is used for paths that are recognised but that none of
  the handlers in this file act upon.
*/
#define _CS_MISC        1

/* Names, identifiers and flags */
#define _CS_ID          2
#define _CS_CSNAME      3
#define _CS_FAMILY      4
#define _CS_ORDER       5
#define _CS_COLNAME     6
#define _CS_FLAG        7

/* Container elements */
#define _CS_CHARSET     8
#define _CS_COLLATION   9

/* Conversion and classification tables */
#define _CS_UPPERMAP    10
#define _CS_LOWERMAP    11
#define _CS_UNIMAP      12
#define _CS_COLLMAP     13
#define _CS_CTYPEMAP    14

#define _CS_PRIMARY_ID  15
#define _CS_BINARY_ID   16
#define _CS_CSDESCRIPT  17

/*
  Tailoring rules.  These four values must stay consecutive and in
  this order: cs_value() uses (state - _CS_RESET) as an array index.
*/
#define _CS_RESET       18
#define _CS_DIFF1       19
#define _CS_DIFF2       20
#define _CS_DIFF3       21
|  | |
| 
 | |
| static struct my_cs_file_section_st sec[] = | |
| { | |
|   {_CS_MISC,		"xml"}, | |
|   {_CS_MISC,		"xml/version"}, | |
|   {_CS_MISC,		"xml/encoding"}, | |
|   {_CS_MISC,		"charsets"}, | |
|   {_CS_MISC,		"charsets/max-id"}, | |
|   {_CS_CHARSET,		"charsets/charset"}, | |
|   {_CS_PRIMARY_ID,	"charsets/charset/primary-id"}, | |
|   {_CS_BINARY_ID,	"charsets/charset/binary-id"}, | |
|   {_CS_CSNAME,		"charsets/charset/name"}, | |
|   {_CS_FAMILY,		"charsets/charset/family"}, | |
|   {_CS_CSDESCRIPT,	"charsets/charset/description"}, | |
|   {_CS_MISC,		"charsets/charset/alias"}, | |
|   {_CS_MISC,		"charsets/charset/ctype"}, | |
|   {_CS_CTYPEMAP,	"charsets/charset/ctype/map"}, | |
|   {_CS_MISC,		"charsets/charset/upper"}, | |
|   {_CS_UPPERMAP,	"charsets/charset/upper/map"}, | |
|   {_CS_MISC,		"charsets/charset/lower"}, | |
|   {_CS_LOWERMAP,	"charsets/charset/lower/map"}, | |
|   {_CS_MISC,		"charsets/charset/unicode"}, | |
|   {_CS_UNIMAP,		"charsets/charset/unicode/map"}, | |
|   {_CS_COLLATION,	"charsets/charset/collation"}, | |
|   {_CS_COLNAME,		"charsets/charset/collation/name"}, | |
|   {_CS_ID,		"charsets/charset/collation/id"}, | |
|   {_CS_ORDER,		"charsets/charset/collation/order"}, | |
|   {_CS_FLAG,		"charsets/charset/collation/flag"}, | |
|   {_CS_COLLMAP,		"charsets/charset/collation/map"}, | |
|   {_CS_RESET,		"charsets/charset/collation/rules/reset"}, | |
|   {_CS_DIFF1,		"charsets/charset/collation/rules/p"}, | |
|   {_CS_DIFF2,		"charsets/charset/collation/rules/s"}, | |
|   {_CS_DIFF3,		"charsets/charset/collation/rules/t"}, | |
|   {0,	NULL} | |
| }; | |
| 
 | |
| static struct my_cs_file_section_st * cs_file_sec(const char *attr, size_t len) | |
| { | |
|   struct my_cs_file_section_st *s; | |
|   for (s=sec; s->str; s++) | |
|   { | |
|     if (!strncmp(attr,s->str,len)) | |
|       return s; | |
|   } | |
|   return NULL; | |
| } | |
| 
 | |
| #define MY_CS_CSDESCR_SIZE	64 | |
| #define MY_CS_TAILORING_SIZE	1024 | |
|  | |
| typedef struct my_cs_file_info | |
| { | |
|   char   csname[MY_CS_NAME_SIZE]; | |
|   char   name[MY_CS_NAME_SIZE]; | |
|   uchar  ctype[MY_CS_CTYPE_TABLE_SIZE]; | |
|   uchar  to_lower[MY_CS_TO_LOWER_TABLE_SIZE]; | |
|   uchar  to_upper[MY_CS_TO_UPPER_TABLE_SIZE]; | |
|   uchar  sort_order[MY_CS_SORT_ORDER_TABLE_SIZE]; | |
|   uint16 tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE]; | |
|   char   comment[MY_CS_CSDESCR_SIZE]; | |
|   char   tailoring[MY_CS_TAILORING_SIZE]; | |
|   size_t tailoring_length; | |
|   CHARSET_INFO cs; | |
|   int (*add_collation)(CHARSET_INFO *cs); | |
| } MY_CHARSET_LOADER; | |
| 
 | |
| 
 | |
| 
 | |
| static int fill_uchar(uchar *a,uint size,const char *str, size_t len) | |
| { | |
|   uint i= 0; | |
|   const char *s, *b, *e=str+len; | |
|    | |
|   for (s=str ; s < e ; i++) | |
|   {  | |
|     for ( ; (s < e) && strchr(" \t\r\n",s[0]); s++) ; | |
|     b=s; | |
|     for ( ; (s < e) && !strchr(" \t\r\n",s[0]); s++) ; | |
|     if (s == b || i > size) | |
|       break; | |
|     a[i]= (uchar) strtoul(b,NULL,16); | |
|   } | |
|   return 0; | |
| } | |
| 
 | |
| static int fill_uint16(uint16 *a,uint size,const char *str, size_t len) | |
| { | |
|   uint i= 0; | |
|    | |
|   const char *s, *b, *e=str+len; | |
|   for (s=str ; s < e ; i++) | |
|   {  | |
|     for ( ; (s < e) && strchr(" \t\r\n",s[0]); s++) ; | |
|     b=s; | |
|     for ( ; (s < e) && !strchr(" \t\r\n",s[0]); s++) ; | |
|     if (s == b || i > size) | |
|       break; | |
|     a[i]= (uint16) strtol(b,NULL,16); | |
|   } | |
|   return 0; | |
| } | |
| 
 | |
| 
 | |
| static int cs_enter(MY_XML_PARSER *st,const char *attr, size_t len) | |
| { | |
|   struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data; | |
|   struct my_cs_file_section_st *s= cs_file_sec(attr,len); | |
|    | |
|   if ( s && (s->state == _CS_CHARSET)) | |
|     bzero(&i->cs,sizeof(i->cs)); | |
|    | |
|   if (s && (s->state == _CS_COLLATION)) | |
|     i->tailoring_length= 0; | |
| 
 | |
|   return MY_XML_OK; | |
| } | |
| 
 | |
| 
 | |
| static int cs_leave(MY_XML_PARSER *st,const char *attr, size_t len) | |
| { | |
|   struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data; | |
|   struct my_cs_file_section_st *s= cs_file_sec(attr,len); | |
|   int    state= s ? s->state : 0; | |
|   int    rc; | |
|    | |
|   switch(state){ | |
|   case _CS_COLLATION: | |
|     rc= i->add_collation ? i->add_collation(&i->cs) : MY_XML_OK; | |
|     break; | |
|   default: | |
|     rc=MY_XML_OK; | |
|   } | |
|   return rc; | |
| } | |
| 
 | |
| 
 | |
| static int cs_value(MY_XML_PARSER *st,const char *attr, size_t len) | |
| { | |
|   struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data; | |
|   struct my_cs_file_section_st *s; | |
|   int    state= (int)((s=cs_file_sec(st->attr, strlen(st->attr))) ? s->state : | |
|                       0); | |
|    | |
|   switch (state) { | |
|   case _CS_ID: | |
|     i->cs.number= strtol(attr,(char**)NULL,10); | |
|     break; | |
|   case _CS_BINARY_ID: | |
|     i->cs.binary_number= strtol(attr,(char**)NULL,10); | |
|     break; | |
|   case _CS_PRIMARY_ID: | |
|     i->cs.primary_number= strtol(attr,(char**)NULL,10); | |
|     break; | |
|   case _CS_COLNAME: | |
|     i->cs.name=mstr(i->name,attr,len,MY_CS_NAME_SIZE-1); | |
|     break; | |
|   case _CS_CSNAME: | |
|     i->cs.csname=mstr(i->csname,attr,len,MY_CS_NAME_SIZE-1); | |
|     break; | |
|   case _CS_CSDESCRIPT: | |
|     i->cs.comment=mstr(i->comment,attr,len,MY_CS_CSDESCR_SIZE-1); | |
|     break; | |
|   case _CS_FLAG: | |
|     if (!strncmp("primary",attr,len)) | |
|       i->cs.state|= MY_CS_PRIMARY; | |
|     else if (!strncmp("binary",attr,len)) | |
|       i->cs.state|= MY_CS_BINSORT; | |
|     else if (!strncmp("compiled",attr,len)) | |
|       i->cs.state|= MY_CS_COMPILED; | |
|     break; | |
|   case _CS_UPPERMAP: | |
|     fill_uchar(i->to_upper,MY_CS_TO_UPPER_TABLE_SIZE,attr,len); | |
|     i->cs.to_upper=i->to_upper; | |
|     break; | |
|   case _CS_LOWERMAP: | |
|     fill_uchar(i->to_lower,MY_CS_TO_LOWER_TABLE_SIZE,attr,len); | |
|     i->cs.to_lower=i->to_lower; | |
|     break; | |
|   case _CS_UNIMAP: | |
|     fill_uint16(i->tab_to_uni,MY_CS_TO_UNI_TABLE_SIZE,attr,len); | |
|     i->cs.tab_to_uni=i->tab_to_uni; | |
|     break; | |
|   case _CS_COLLMAP: | |
|     fill_uchar(i->sort_order,MY_CS_SORT_ORDER_TABLE_SIZE,attr,len); | |
|     i->cs.sort_order=i->sort_order; | |
|     break; | |
|   case _CS_CTYPEMAP: | |
|     fill_uchar(i->ctype,MY_CS_CTYPE_TABLE_SIZE,attr,len); | |
|     i->cs.ctype=i->ctype; | |
|     break; | |
|   case _CS_RESET: | |
|   case _CS_DIFF1: | |
|   case _CS_DIFF2: | |
|   case _CS_DIFF3: | |
|     { | |
|       /* | |
|         Convert collation description from | |
|         Locale Data Markup Language (LDML) | |
|         into ICU Collation Customization expression. | |
|       */ | |
|       char arg[16]; | |
|       const char *cmd[]= {"&","<","<<","<<<"}; | |
|       i->cs.tailoring= i->tailoring; | |
|       mstr(arg,attr,len,sizeof(arg)-1); | |
|       if (i->tailoring_length + 20 < sizeof(i->tailoring)) | |
|       { | |
|         char *dst= i->tailoring_length + i->tailoring; | |
|         i->tailoring_length+= sprintf(dst," %s %s",cmd[state-_CS_RESET],arg); | |
|       } | |
|     } | |
|   } | |
|   return MY_XML_OK; | |
| } | |
| 
 | |
| 
 | |
| my_bool my_parse_charset_xml(const char *buf, size_t len, | |
|                              int (*add_collation)(CHARSET_INFO *cs)) | |
| { | |
|   MY_XML_PARSER p; | |
|   struct my_cs_file_info i; | |
|   my_bool rc; | |
|    | |
|   my_xml_parser_create(&p); | |
|   my_xml_set_enter_handler(&p,cs_enter); | |
|   my_xml_set_value_handler(&p,cs_value); | |
|   my_xml_set_leave_handler(&p,cs_leave); | |
|   i.add_collation= add_collation; | |
|   my_xml_set_user_data(&p,(void*)&i); | |
|   rc= (my_xml_parse(&p,buf,len) == MY_XML_OK) ? FALSE : TRUE; | |
|   my_xml_parser_free(&p); | |
|   return rc; | |
| } | |
| 
 | |
| 
 | |
| /* | |
|   Check repertoire: detect pure ascii strings | |
| */ | |
| uint | |
| my_string_repertoire(CHARSET_INFO *cs, const char *str, ulong length) | |
| { | |
|   const char *strend= str + length; | |
|   if (cs->mbminlen == 1) | |
|   { | |
|     for ( ; str < strend; str++) | |
|     { | |
|       if (((uchar) *str) > 0x7F) | |
|         return MY_REPERTOIRE_UNICODE30; | |
|     } | |
|   } | |
|   else | |
|   { | |
|     my_wc_t wc; | |
|     int chlen; | |
|     for (; | |
|          (chlen= cs->cset->mb_wc(cs, &wc, (uchar*) str, (uchar*) strend)) > 0; | |
|          str+= chlen) | |
|     { | |
|       if (wc > 0x7F) | |
|         return MY_REPERTOIRE_UNICODE30; | |
|     } | |
|   } | |
|   return MY_REPERTOIRE_ASCII; | |
| } | |
| 
 | |
| 
 | |
| /* | |
|   Returns repertoire for charset | |
| */ | |
| uint my_charset_repertoire(CHARSET_INFO *cs) | |
| { | |
|   return cs->state & MY_CS_PUREASCII ? | |
|     MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30; | |
| } | |
| 
 | |
| 
 | |
| /* | |
|   Detect whether a character set is ASCII compatible. | |
|  | |
|   Returns TRUE for: | |
|    | |
|   - all 8bit character sets whose Unicode mapping of 0x7B is '{' | |
|     (ignores swe7 which maps 0x7B to "LATIN LETTER A WITH DIAERESIS") | |
|    | |
|   - all multi-byte character sets having mbminlen == 1 | |
|     (ignores ucs2 whose mbminlen is 2) | |
|    | |
|   TODO: | |
|    | |
|   When merging to 5.2, this function should be changed | |
|   to check a new flag MY_CS_NONASCII,  | |
|    | |
|      return (cs->flag & MY_CS_NONASCII) ? 0 : 1; | |
|    | |
|   This flag was previously added into 5.2 under terms | |
|   of WL#3759 "Optimize identifier conversion in client-server protocol" | |
|   especially to mark character sets not compatible with ASCII. | |
|    | |
|   We won't backport this flag to 5.0 or 5.1. | |
|   This function is Ok for 5.0 and 5.1, because we're not going | |
|   to introduce new tricky character sets between 5.0 and 5.2. | |
| */ | |
| my_bool | |
| my_charset_is_ascii_based(CHARSET_INFO *cs) | |
| { | |
|   return  | |
|     (cs->mbmaxlen == 1 && cs->tab_to_uni && cs->tab_to_uni['{'] == '{') || | |
|     (cs->mbminlen == 1 && cs->mbmaxlen > 1); | |
| } | |
| 
 | |
| 
 | |
| /* | |
|   Detect if a character set is 8bit, | |
|   and it is pure ascii, i.e. doesn't have | |
|   characters outside U+0000..U+007F | |
|   This functions is shared between "conf_to_src" | |
|   and dynamic charsets loader in "mysqld". | |
| */ | |
| my_bool | |
| my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs) | |
| { | |
|   size_t code; | |
|   if (!cs->tab_to_uni) | |
|     return 0; | |
|   for (code= 0; code < 256; code++) | |
|   { | |
|     if (cs->tab_to_uni[code] > 0x7F) | |
|       return 0; | |
|   } | |
|   return 1; | |
| } | |
| 
 | |
| 
 | |
| /* | |
|   Shared function between conf_to_src and mysys. | |
|   Check if a 8bit character set is compatible with | |
|   ascii on the range 0x00..0x7F. | |
| */ | |
| my_bool | |
| my_charset_is_ascii_compatible(CHARSET_INFO *cs) | |
| { | |
|   uint i; | |
|   if (!cs->tab_to_uni) | |
|     return 1; | |
|   for (i= 0; i < 128; i++) | |
|   { | |
|     if (cs->tab_to_uni[i] != i) | |
|       return 0; | |
|   } | |
|   return 1; | |
| }
 |