nfkc.c

Go to the documentation of this file.
00001 /* nfkc.c --- Unicode normalization utilities.
00002  * Copyright (C) 2002, 2003, 2004, 2006  Simon Josefsson
00003  *
00004  * This file is part of GNU Libidn.
00005  *
00006  * GNU Libidn is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  *
00011  * GNU Libidn is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with GNU Libidn; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
00019  *
00020  */
00021 
00022 #ifdef HAVE_CONFIG_H
00023 # include "config.h"
00024 #endif
00025 
00026 #include <stdlib.h>
00027 #include <string.h>
00028 
00029 #include "stringprep.h"
00030 
00031 /* This file contains functions from GLIB, including gutf8.c and
00032  * gunidecomp.c, all licensed under LGPL and copyright held by:
00033  *
00034  *  Copyright (C) 1999, 2000 Tom Tromey
00035  *  Copyright 2000 Red Hat, Inc.
00036  */
00037 
/* Compatibility shims: the code imported from GLIB below is kept as
 * close to upstream as possible, so the GLIB names it relies on are
 * mapped onto plain C here. */

/* Scalar types. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define gushort unsigned short
#define gint int
#define guint unsigned int
#define glong long
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t

/* Boolean constants. */
#define FALSE 0
#define TRUE 1

/* Memory management maps directly onto the C library allocator. */
#define g_malloc malloc
#define g_free free
#define g_new(struct_type, n_structs)                                   \
  ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))

/* Element count of a real array (not of a pointer). */
#define G_N_ELEMENTS(arr)               (sizeof (arr) / sizeof ((arr)[0]))

/* Error reporting is compiled out: all four arguments are discarded. */
#define GError void
#define g_set_error(a,b,c,d) ((void) 0)

/* Wrappers that turn a braced group into a single statement. */
#if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
#  define G_STMT_START        (void)(
#  define G_STMT_END          )
#elif (defined (sun) || defined (__sun__))
#  define G_STMT_START      if (1)
#  define G_STMT_END        else (void)0
#else
#  define G_STMT_START      do
#  define G_STMT_END        while (0)
#endif

/* Precondition checks are disabled: neither EXPR nor VAL is evaluated. */
#define g_return_val_if_fail(expr,val)          G_STMT_START{ (void)0; }G_STMT_END
00073 
00074 /* Code from GLIB gunicode.h starts here. */
00075 
/* Normalization forms understood by the normalizer below.  The first
 * four enumerators carry the values; the NF* names are aliases. */
typedef enum
{
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_ALL_COMPOSE = 3,

  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
} GNormalizeMode;
00088 
00089 /* Code from GLIB gutf8.c starts here. */
00090 
/* Classify the lead byte Char of a UTF-8 sequence.
 *
 * On return Len holds the total sequence length (1..6) and Mask the
 * bits of the lead byte that carry payload.  If Char cannot start a
 * sequence, Len is set to -1 and Mask is left untouched.
 *
 * The body is wrapped in do { } while (0) and every argument is
 * parenthesized so the macro behaves like one statement, including
 * inside an unbraced if/else.  Char is evaluated more than once. */
#define UTF8_COMPUTE(Char, Mask, Len)           \
  do                                            \
    {                                           \
      if ((Char) < 128)                         \
        {                                       \
          (Len) = 1;                            \
          (Mask) = 0x7f;                        \
        }                                       \
      else if (((Char) & 0xe0) == 0xc0)         \
        {                                       \
          (Len) = 2;                            \
          (Mask) = 0x1f;                        \
        }                                       \
      else if (((Char) & 0xf0) == 0xe0)         \
        {                                       \
          (Len) = 3;                            \
          (Mask) = 0x0f;                        \
        }                                       \
      else if (((Char) & 0xf8) == 0xf0)         \
        {                                       \
          (Len) = 4;                            \
          (Mask) = 0x07;                        \
        }                                       \
      else if (((Char) & 0xfc) == 0xf8)         \
        {                                       \
          (Len) = 5;                            \
          (Mask) = 0x03;                        \
        }                                       \
      else if (((Char) & 0xfe) == 0xfc)         \
        {                                       \
          (Len) = 6;                            \
          (Mask) = 0x01;                        \
        }                                       \
      else                                      \
        (Len) = -1;                             \
    }                                           \
  while (0)
00124 
/* Number of bytes (1..6) needed to encode code point Char in UTF-8.
 * Char is evaluated several times. */
#define UTF8_LENGTH(Char)                       \
  ((Char) < 0x80 ? 1                            \
   : (Char) < 0x800 ? 2                         \
   : (Char) < 0x10000 ? 3                       \
   : (Char) < 0x200000 ? 4                      \
   : (Char) < 0x4000000 ? 5                     \
   : 6)
00131 
00132 
/* Assemble the code point of the Len-byte UTF-8 sequence at Chars.
 *
 * Mask selects the payload bits of the lead byte; Count is a scratch
 * loop variable.  If any of the Len - 1 trailing bytes is not of the
 * form 10xxxxxx, Result is set to -1 and decoding stops at that byte.
 *
 * Wrapped in do { } while (0) so that the assignment and the loop form
 * one statement; the break only leaves the inner for loop. */
#define UTF8_GET(Result, Chars, Count, Mask, Len)               \
  do                                                            \
    {                                                           \
      (Result) = (Chars)[0] & (Mask);                           \
      for ((Count) = 1; (Count) < (Len); ++(Count))             \
        {                                                       \
          if (((Chars)[(Count)] & 0xc0) != 0x80)                \
            {                                                   \
              (Result) = -1;                                    \
              break;                                            \
            }                                                   \
          (Result) <<= 6;                                       \
          (Result) |= ((Chars)[(Count)] & 0x3f);                \
        }                                                       \
    }                                                           \
  while (0)
00145 
/* True if Char is below 0x110000, is not in the surrogate block
 * D800..DFFF, is not in FDD0..FDEF, and does not end in FFFE or FFFF.
 * Char is evaluated several times. */
#define UNICODE_VALID(Char)                             \
  ((Char) < 0x110000                                    \
   && (((Char) & 0xFFFFF800) != 0xD800)                 \
   && ((Char) < 0xFDD0 || (Char) > 0xFDEF)              \
   && (((Char) & 0xFFFE) != 0xFFFE))
00151 
00152 
00153 static const gchar utf8_skip_data[256] = {
00154   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00155   1, 1, 1, 1, 1, 1, 1,
00156   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00157   1, 1, 1, 1, 1, 1, 1,
00158   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00159   1, 1, 1, 1, 1, 1, 1,
00160   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00161   1, 1, 1, 1, 1, 1, 1,
00162   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00163   1, 1, 1, 1, 1, 1, 1,
00164   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00165   1, 1, 1, 1, 1, 1, 1,
00166   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
00167   2, 2, 2, 2, 2, 2, 2,
00168   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
00169   5, 5, 5, 6, 6, 1, 1
00170 };
00171 
00172 static const gchar *const g_utf8_skip = utf8_skip_data;
00173 
00174 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
00175 
00176 /*
00177  * g_utf8_strlen:
00178  * @p: pointer to the start of a UTF-8 encoded string.
00179  * @max: the maximum number of bytes to examine. If @max
00180  *       is less than 0, then the string is assumed to be
00181  *       nul-terminated. If @max is 0, @p will not be examined and
00182  *       may be %NULL.
00183  *
00184  * Returns the length of the string in characters.
00185  *
00186  * Return value: the length of the string in characters
00187  **/
00188 static glong
00189 g_utf8_strlen (const gchar * p, gssize max)
00190 {
00191   glong len = 0;
00192   const gchar *start = p;
00193   g_return_val_if_fail (p != NULL || max == 0, 0);
00194 
00195   if (max < 0)
00196     {
00197       while (*p)
00198         {
00199           p = g_utf8_next_char (p);
00200           ++len;
00201         }
00202     }
00203   else
00204     {
00205       if (max == 0 || !*p)
00206         return 0;
00207 
00208       p = g_utf8_next_char (p);
00209 
00210       while (p - start < max && *p)
00211         {
00212           ++len;
00213           p = g_utf8_next_char (p);
00214         }
00215 
00216       /* only do the last len increment if we got a complete
00217        * char (don't count partial chars)
00218        */
00219       if (p - start == max)
00220         ++len;
00221     }
00222 
00223   return len;
00224 }
00225 
00226 /*
00227  * g_utf8_get_char:
00228  * @p: a pointer to Unicode character encoded as UTF-8
00229  *
00230  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
00231  * If @p does not point to a valid UTF-8 encoded character, results are
00232  * undefined. If you are not sure that the bytes are complete
00233  * valid Unicode characters, you should use g_utf8_get_char_validated()
00234  * instead.
00235  *
00236  * Return value: the resulting character
00237  **/
00238 static gunichar
00239 g_utf8_get_char (const gchar * p)
00240 {
00241   int i, mask = 0, len;
00242   gunichar result;
00243   unsigned char c = (unsigned char) *p;
00244 
00245   UTF8_COMPUTE (c, mask, len);
00246   if (len == -1)
00247     return (gunichar) - 1;
00248   UTF8_GET (result, p, i, mask, len);
00249 
00250   return result;
00251 }
00252 
00253 /*
00254  * g_unichar_to_utf8:
00255  * @c: a ISO10646 character code
00256  * @outbuf: output buffer, must have at least 6 bytes of space.
00257  *       If %NULL, the length will be computed and returned
00258  *       and nothing will be written to @outbuf.
00259  *
00260  * Converts a single character to UTF-8.
00261  *
00262  * Return value: number of bytes written
00263  **/
00264 static int
00265 g_unichar_to_utf8 (gunichar c, gchar * outbuf)
00266 {
00267   guint len = 0;
00268   int first;
00269   int i;
00270 
00271   if (c < 0x80)
00272     {
00273       first = 0;
00274       len = 1;
00275     }
00276   else if (c < 0x800)
00277     {
00278       first = 0xc0;
00279       len = 2;
00280     }
00281   else if (c < 0x10000)
00282     {
00283       first = 0xe0;
00284       len = 3;
00285     }
00286   else if (c < 0x200000)
00287     {
00288       first = 0xf0;
00289       len = 4;
00290     }
00291   else if (c < 0x4000000)
00292     {
00293       first = 0xf8;
00294       len = 5;
00295     }
00296   else
00297     {
00298       first = 0xfc;
00299       len = 6;
00300     }
00301 
00302   if (outbuf)
00303     {
00304       for (i = len - 1; i > 0; --i)
00305         {
00306           outbuf[i] = (c & 0x3f) | 0x80;
00307           c >>= 6;
00308         }
00309       outbuf[0] = c | first;
00310     }
00311 
00312   return len;
00313 }
00314 
00315 /*
00316  * g_utf8_to_ucs4_fast:
00317  * @str: a UTF-8 encoded string
00318  * @len: the maximum length of @str to use. If @len < 0, then
00319  *       the string is nul-terminated.
00320  * @items_written: location to store the number of characters in the
00321  *                 result, or %NULL.
00322  *
00323  * Convert a string from UTF-8 to a 32-bit fixed width
00324  * representation as UCS-4, assuming valid UTF-8 input.
00325  * This function is roughly twice as fast as g_utf8_to_ucs4()
00326  * but does no error checking on the input.
00327  *
00328  * Return value: a pointer to a newly allocated UCS-4 string.
00329  *               This value must be freed with g_free().
00330  **/
00331 static gunichar *
00332 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
00333 {
00334   gint j, charlen;
00335   gunichar *result;
00336   gint n_chars, i;
00337   const gchar *p;
00338 
00339   g_return_val_if_fail (str != NULL, NULL);
00340 
00341   p = str;
00342   n_chars = 0;
00343   if (len < 0)
00344     {
00345       while (*p)
00346         {
00347           p = g_utf8_next_char (p);
00348           ++n_chars;
00349         }
00350     }
00351   else
00352     {
00353       while (p < str + len && *p)
00354         {
00355           p = g_utf8_next_char (p);
00356           ++n_chars;
00357         }
00358     }
00359 
00360   result = g_new (gunichar, n_chars + 1);
00361   if (!result)
00362     return NULL;
00363 
00364   p = str;
00365   for (i = 0; i < n_chars; i++)
00366     {
00367       gunichar wc = ((unsigned char *) p)[0];
00368 
00369       if (wc < 0x80)
00370         {
00371           result[i] = wc;
00372           p++;
00373         }
00374       else
00375         {
00376           if (wc < 0xe0)
00377             {
00378               charlen = 2;
00379               wc &= 0x1f;
00380             }
00381           else if (wc < 0xf0)
00382             {
00383               charlen = 3;
00384               wc &= 0x0f;
00385             }
00386           else if (wc < 0xf8)
00387             {
00388               charlen = 4;
00389               wc &= 0x07;
00390             }
00391           else if (wc < 0xfc)
00392             {
00393               charlen = 5;
00394               wc &= 0x03;
00395             }
00396           else
00397             {
00398               charlen = 6;
00399               wc &= 0x01;
00400             }
00401 
00402           for (j = 1; j < charlen; j++)
00403             {
00404               wc <<= 6;
00405               wc |= ((unsigned char *) p)[j] & 0x3f;
00406             }
00407 
00408           result[i] = wc;
00409           p += charlen;
00410         }
00411     }
00412   result[i] = 0;
00413 
00414   if (items_written)
00415     *items_written = i;
00416 
00417   return result;
00418 }
00419 
00420 /*
00421  * g_ucs4_to_utf8:
00422  * @str: a UCS-4 encoded string
00423  * @len: the maximum length of @str to use. If @len < 0, then
00424  *       the string is terminated with a 0 character.
00425  * @items_read: location to store number of characters read read, or %NULL.
00426  * @items_written: location to store number of bytes written or %NULL.
00427  *                 The value here stored does not include the trailing 0
00428  *                 byte.
00429  * @error: location to store the error occuring, or %NULL to ignore
00430  *         errors. Any of the errors in #GConvertError other than
00431  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
00432  *
00433  * Convert a string from a 32-bit fixed width representation as UCS-4.
00434  * to UTF-8. The result will be terminated with a 0 byte.
00435  *
00436  * Return value: a pointer to a newly allocated UTF-8 string.
00437  *               This value must be freed with g_free(). If an
00438  *               error occurs, %NULL will be returned and
00439  *               @error set.
00440  **/
00441 static gchar *
00442 g_ucs4_to_utf8 (const gunichar * str,
00443                 glong len,
00444                 glong * items_read, glong * items_written, GError ** error)
00445 {
00446   gint result_length;
00447   gchar *result = NULL;
00448   gchar *p;
00449   gint i;
00450 
00451   result_length = 0;
00452   for (i = 0; len < 0 || i < len; i++)
00453     {
00454       if (!str[i])
00455         break;
00456 
00457       if (str[i] >= 0x80000000)
00458         {
00459           if (items_read)
00460             *items_read = i;
00461 
00462           g_set_error (error, G_CONVERT_ERROR,
00463                        G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
00464                        _("Character out of range for UTF-8"));
00465           goto err_out;
00466         }
00467 
00468       result_length += UTF8_LENGTH (str[i]);
00469     }
00470 
00471   result = g_malloc (result_length + 1);
00472   if (!result)
00473     return NULL;
00474   p = result;
00475 
00476   i = 0;
00477   while (p < result + result_length)
00478     p += g_unichar_to_utf8 (str[i++], p);
00479 
00480   *p = '\0';
00481 
00482   if (items_written)
00483     *items_written = p - result;
00484 
00485 err_out:
00486   if (items_read)
00487     *items_read = i;
00488 
00489   return result;
00490 }
00491 
00492 /* Code from GLIB gunidecomp.c starts here. */
00493 
00494 #include "gunidecomp.h"
00495 #include "gunicomp.h"
00496 
/* Combining-class lookup through two-level page tables.  A page entry
 * at or above G_UNICODE_MAX_TABLE_INDEX stores the class of the whole
 * page directly (offset by that constant); a smaller entry selects a
 * row of cclass_data that is indexed by the low byte. */
#define CC_PART1(Page, Char) \
  ((combining_class_table_part1[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (cclass_data[combining_class_table_part1[Page]][Char]) \
   : (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX))

#define CC_PART2(Page, Char) \
  ((combining_class_table_part2[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (cclass_data[combining_class_table_part2[Page]][Char]) \
   : (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Combining class of Char.  Part 1 covers 0..G_UNICODE_LAST_CHAR_PART1,
 * part 2 covers 0xe0000..G_UNICODE_LAST_CHAR; everything else is 0.
 * Char is evaluated several times. */
#define COMBINING_CLASS(Char) \
  (((Char) > G_UNICODE_LAST_CHAR_PART1) \
   ? (((Char) < 0xe0000 || (Char) > G_UNICODE_LAST_CHAR) \
      ? 0 \
      : CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff)) \
   : CC_PART1 ((Char) >> 8, (Char) & 0xff))
00513 
/* Constants for arithmetic Hangul syllable [de]composition: the first
 * code point of each range and the number of entries per range. */
enum
{
  SBase = 0xAC00,               /* first precomposed syllable */
  LBase = 0x1100,               /* first leading consonant */
  VBase = 0x1161,               /* first vowel */
  TBase = 0x11A7,               /* one before the first trailing consonant */
  LCount = 19,
  VCount = 21,
  TCount = 28,
  NCount = VCount * TCount,     /* syllables per leading consonant */
  SCount = LCount * NCount      /* total number of syllables */
};
00524 
00525 /*
00526  * g_unicode_canonical_ordering:
00527  * @string: a UCS-4 encoded string.
00528  * @len: the maximum length of @string to use.
00529  *
00530  * Computes the canonical ordering of a string in-place.
00531  * This rearranges decomposed characters in the string
00532  * according to their combining classes.  See the Unicode
00533  * manual for more information.
00534  **/
00535 static void
00536 g_unicode_canonical_ordering (gunichar * string, gsize len)
00537 {
00538   gsize i;
00539   int swap = 1;
00540 
00541   while (swap)
00542     {
00543       int last;
00544       swap = 0;
00545       last = COMBINING_CLASS (string[0]);
00546       for (i = 0; i < len - 1; ++i)
00547         {
00548           int next = COMBINING_CLASS (string[i + 1]);
00549           if (next != 0 && last > next)
00550             {
00551               gsize j;
00552               /* Percolate item leftward through string.  */
00553               for (j = i + 1; j > 0; --j)
00554                 {
00555                   gunichar t;
00556                   if (COMBINING_CLASS (string[j - 1]) <= next)
00557                     break;
00558                   t = string[j];
00559                   string[j] = string[j - 1];
00560                   string[j - 1] = t;
00561                   swap = 1;
00562                 }
00563               /* We're re-entering the loop looking at the old
00564                  character again.  */
00565               next = last;
00566             }
00567           last = next;
00568         }
00569     }
00570 }
00571 
00572 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
00573  * r should be null or have sufficient space. Calling with r == NULL will
00574  * only calculate the result_len; however, a buffer with space for three
00575  * characters will always be big enough. */
00576 static void
00577 decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
00578 {
00579   gint SIndex = s - SBase;
00580 
00581   /* not a hangul syllable */
00582   if (SIndex < 0 || SIndex >= SCount)
00583     {
00584       if (r)
00585         r[0] = s;
00586       *result_len = 1;
00587     }
00588   else
00589     {
00590       gunichar L = LBase + SIndex / NCount;
00591       gunichar V = VBase + (SIndex % NCount) / TCount;
00592       gunichar T = TBase + SIndex % TCount;
00593 
00594       if (r)
00595         {
00596           r[0] = L;
00597           r[1] = V;
00598         }
00599 
00600       if (T != TBase)
00601         {
00602           if (r)
00603             r[2] = T;
00604           *result_len = 3;
00605         }
00606       else
00607         *result_len = 2;
00608     }
00609 }
00610 
00611 /* returns a pointer to a null-terminated UTF-8 string */
00612 static const gchar *
00613 find_decomposition (gunichar ch, gboolean compat)
00614 {
00615   int start = 0;
00616   int end = G_N_ELEMENTS (decomp_table);
00617 
00618   if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
00619     {
00620       while (TRUE)
00621         {
00622           int half = (start + end) / 2;
00623           if (ch == decomp_table[half].ch)
00624             {
00625               int offset;
00626 
00627               if (compat)
00628                 {
00629                   offset = decomp_table[half].compat_offset;
00630                   if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
00631                     offset = decomp_table[half].canon_offset;
00632                 }
00633               else
00634                 {
00635                   offset = decomp_table[half].canon_offset;
00636                   if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
00637                     return NULL;
00638                 }
00639 
00640               return &(decomp_expansion_string[offset]);
00641             }
00642           else if (half == start)
00643             break;
00644           else if (ch > decomp_table[half].ch)
00645             start = half;
00646           else
00647             end = half;
00648         }
00649     }
00650 
00651   return NULL;
00652 }
00653 
00654 /* L,V => LV and LV,T => LVT  */
00655 static gboolean
00656 combine_hangul (gunichar a, gunichar b, gunichar * result)
00657 {
00658   gint LIndex = a - LBase;
00659   gint SIndex = a - SBase;
00660 
00661   gint VIndex = b - VBase;
00662   gint TIndex = b - TBase;
00663 
00664   if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
00665     {
00666       *result = SBase + (LIndex * VCount + VIndex) * TCount;
00667       return TRUE;
00668     }
00669   else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
00670            && 0 <= TIndex && TIndex <= TCount)
00671     {
00672       *result = a + TIndex;
00673       return TRUE;
00674     }
00675 
00676   return FALSE;
00677 }
00678 
/* Composition index lookup through a two-level page table.  A page
 * entry at or above G_UNICODE_MAX_TABLE_INDEX stores the index of the
 * whole page directly (offset by that constant); a smaller entry
 * selects a row of compose_data that is indexed by the low byte. */
#define CI(Page, Char) \
  ((compose_table[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_data[compose_table[Page]][Char]) \
   : (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Composition index of Char; 0 for pages beyond COMPOSE_TABLE_LAST.
 * Char is evaluated several times. */
#define COMPOSE_INDEX(Char) \
     ((((Char) >> 8) <= (COMPOSE_TABLE_LAST)) ? CI((Char) >> 8, (Char) & 0xff) : 0)
00686 
00687 static gboolean
00688 combine (gunichar a, gunichar b, gunichar * result)
00689 {
00690   gushort index_a, index_b;
00691 
00692   if (combine_hangul (a, b, result))
00693     return TRUE;
00694 
00695   index_a = COMPOSE_INDEX (a);
00696 
00697   if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
00698     {
00699       if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
00700         {
00701           *result =
00702             compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
00703           return TRUE;
00704         }
00705       else
00706         return FALSE;
00707     }
00708 
00709   index_b = COMPOSE_INDEX (b);
00710 
00711   if (index_b >= COMPOSE_SECOND_SINGLE_START)
00712     {
00713       if (a ==
00714           compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
00715         {
00716           *result =
00717             compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
00718           return TRUE;
00719         }
00720       else
00721         return FALSE;
00722     }
00723 
00724   if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
00725       && index_b >= COMPOSE_SECOND_START
00726       && index_b < COMPOSE_SECOND_SINGLE_START)
00727     {
00728       gunichar res =
00729         compose_array[index_a - COMPOSE_FIRST_START][index_b -
00730                                                      COMPOSE_SECOND_START];
00731 
00732       if (res)
00733         {
00734           *result = res;
00735           return TRUE;
00736         }
00737     }
00738 
00739   return FALSE;
00740 }
00741 
00742 static gunichar *
00743 _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
00744 {
00745   gsize n_wc;
00746   gunichar *wc_buffer;
00747   const char *p;
00748   gsize last_start;
00749   gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
00750   gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
00751 
00752   n_wc = 0;
00753   p = str;
00754   while ((max_len < 0 || p < str + max_len) && *p)
00755     {
00756       const gchar *decomp;
00757       gunichar wc = g_utf8_get_char (p);
00758 
00759       if (wc >= 0xac00 && wc <= 0xd7af)
00760         {
00761           gsize result_len;
00762           decompose_hangul (wc, NULL, &result_len);
00763           n_wc += result_len;
00764         }
00765       else
00766         {
00767           decomp = find_decomposition (wc, do_compat);
00768 
00769           if (decomp)
00770             n_wc += g_utf8_strlen (decomp, -1);
00771           else
00772             n_wc++;
00773         }
00774 
00775       p = g_utf8_next_char (p);
00776     }
00777 
00778   wc_buffer = g_new (gunichar, n_wc + 1);
00779   if (!wc_buffer)
00780     return NULL;
00781 
00782   last_start = 0;
00783   n_wc = 0;
00784   p = str;
00785   while ((max_len < 0 || p < str + max_len) && *p)
00786     {
00787       gunichar wc = g_utf8_get_char (p);
00788       const gchar *decomp;
00789       int cc;
00790       gsize old_n_wc = n_wc;
00791 
00792       if (wc >= 0xac00 && wc <= 0xd7af)
00793         {
00794           gsize result_len;
00795           decompose_hangul (wc, wc_buffer + n_wc, &result_len);
00796           n_wc += result_len;
00797         }
00798       else
00799         {
00800           decomp = find_decomposition (wc, do_compat);
00801 
00802           if (decomp)
00803             {
00804               const char *pd;
00805               for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
00806                 wc_buffer[n_wc++] = g_utf8_get_char (pd);
00807             }
00808           else
00809             wc_buffer[n_wc++] = wc;
00810         }
00811 
00812       if (n_wc > 0)
00813         {
00814           cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
00815 
00816           if (cc == 0)
00817             {
00818               g_unicode_canonical_ordering (wc_buffer + last_start,
00819                                             n_wc - last_start);
00820               last_start = old_n_wc;
00821             }
00822         }
00823 
00824       p = g_utf8_next_char (p);
00825     }
00826 
00827   if (n_wc > 0)
00828     {
00829       g_unicode_canonical_ordering (wc_buffer + last_start,
00830                                     n_wc - last_start);
00831       last_start = n_wc;
00832     }
00833 
00834   wc_buffer[n_wc] = 0;
00835 
00836   /* All decomposed and reordered */
00837 
00838   if (do_compose && n_wc > 0)
00839     {
00840       gsize i, j;
00841       int last_cc = 0;
00842       last_start = 0;
00843 
00844       for (i = 0; i < n_wc; i++)
00845         {
00846           int cc = COMBINING_CLASS (wc_buffer[i]);
00847 
00848           if (i > 0 &&
00849               (last_cc == 0 || last_cc != cc) &&
00850               combine (wc_buffer[last_start], wc_buffer[i],
00851                        &wc_buffer[last_start]))
00852             {
00853               for (j = i + 1; j < n_wc; j++)
00854                 wc_buffer[j - 1] = wc_buffer[j];
00855               n_wc--;
00856               i--;
00857 
00858               if (i == last_start)
00859                 last_cc = 0;
00860               else
00861                 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
00862 
00863               continue;
00864             }
00865 
00866           if (cc == 0)
00867             last_start = i;
00868 
00869           last_cc = cc;
00870         }
00871     }
00872 
00873   wc_buffer[n_wc] = 0;
00874 
00875   return wc_buffer;
00876 }
00877 
00878 /*
00879  * g_utf8_normalize:
00880  * @str: a UTF-8 encoded string.
00881  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
00882  * @mode: the type of normalization to perform.
00883  *
00884  * Converts a string into canonical form, standardizing
00885  * such issues as whether a character with an accent
00886  * is represented as a base character and combining
00887  * accent or as a single precomposed character. You
00888  * should generally call g_utf8_normalize() before
00889  * comparing two Unicode strings.
00890  *
00891  * The normalization mode %G_NORMALIZE_DEFAULT only
00892  * standardizes differences that do not affect the
00893  * text content, such as the above-mentioned accent
00894  * representation. %G_NORMALIZE_ALL also standardizes
00895  * the "compatibility" characters in Unicode, such
00896  * as SUPERSCRIPT THREE to the standard forms
00897  * (in this case DIGIT THREE). Formatting information
00898  * may be lost but for most text operations such
00899  * characters should be considered the same.
00900  * For example, g_utf8_collate() normalizes
00901  * with %G_NORMALIZE_ALL as its first step.
00902  *
00903  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
00904  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
00905  * but returned a result with composed forms rather
00906  * than a maximally decomposed form. This is often
00907  * useful if you intend to convert the string to
00908  * a legacy encoding or pass it to a system with
00909  * less capable Unicode handling.
00910  *
00911  * Return value: a newly allocated string, that is the
00912  *   normalized form of @str.
00913  **/
00914 static gchar *
00915 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
00916 {
00917   gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
00918   gchar *result;
00919 
00920   result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
00921   g_free (result_wc);
00922 
00923   return result;
00924 }
00925 
00926 /* Public Libidn API starts here. */
00927 
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to a character encoded as UTF-8.
 *
 * Decodes the UTF-8 sequence starting at @p by calling
 * g_utf8_get_char(); see there for the handling of malformed input.
 *
 * Return value: the decoded character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  const uint32_t wc = g_utf8_get_char (p);

  return wc;
}
00943 
/**
 * stringprep_unichar_to_utf8:
 * @c: a character code.
 * @outbuf: output buffer with room for at least 6 bytes, or NULL to
 *          only compute the length.
 *
 * Encodes @c as UTF-8 by calling g_unichar_to_utf8().
 *
 * Return value: the number of bytes the encoding takes.
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  const int nbytes = g_unichar_to_utf8 (c, outbuf);

  return nbytes;
}
00960 
00976 uint32_t *
00977 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
00978 {
00979   return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
00980 }
00981 
01000 char *
01001 stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
01002                          size_t * items_read, size_t * items_written)
01003 {
01004   return g_ucs4_to_utf8 (str, len, (glong *) items_read,
01005                          (glong *) items_written, NULL);
01006 }
01007 
01030 char *
01031 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
01032 {
01033   return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
01034 }
01035 
01047 uint32_t *
01048 stringprep_ucs4_nfkc_normalize (uint32_t * str, ssize_t len)
01049 {
01050   char *p;
01051   uint32_t *result_wc;
01052 
01053   p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
01054   result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
01055   free (p);
01056 
01057   return result_wc;
01058 }

Generated on Wed Sep 13 10:20:31 2006 for libidn by  doxygen 1.4.7