| 1 | /* |
|---|
| 2 | (c) Copyright 2000 convergence integrated media GmbH. |
|---|
| 3 | All rights reserved. |
|---|
| 4 | |
|---|
| 5 | Written by Denis Oliver Kropp <dok@convergence.de>, |
|---|
| 6 | Andreas Hundt <andi@convergence.de> and |
|---|
| 7 | Sven Neumann <sven@convergence.de> |
|---|
| 8 | |
|---|
| 9 | UTF8 routines ported from glib-2.0 |
|---|
| 10 | |
|---|
| 11 | This library is free software; you can redistribute it and/or |
|---|
| 12 | modify it under the terms of the GNU Lesser General Public |
|---|
| 13 | License as published by the Free Software Foundation; either |
|---|
| 14 | version 2 of the License, or (at your option) any later version. |
|---|
| 15 | |
|---|
| 16 | This library is distributed in the hope that it will be useful, |
|---|
| 17 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|---|
| 19 | Lesser General Public License for more details. |
|---|
| 20 | |
|---|
| 21 | You should have received a copy of the GNU Lesser General Public |
|---|
| 22 | License along with this library; if not, write to the |
|---|
| 23 | Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
|---|
| 24 | Boston, MA 02111-1307, USA. |
|---|
| 25 | */ |
|---|
| 26 | |
|---|
/*
 * UTF8_COMPUTE (Char, Mask, Len)
 *
 * Classifies the lead byte of a multi-byte UTF-8 sequence.
 *
 *   Char  lead byte; only its low eight bits are examined and the
 *         argument is evaluated exactly once
 *   Mask  lvalue receiving the bit mask that selects the payload bits
 *         of the lead byte
 *   Len   lvalue receiving the total sequence length in bytes (2..6),
 *         or -1 if Char is not a multi-byte lead byte
 *
 * Bytes below 0x80, continuation bytes (0x80..0xbf), 0xfe and 0xff all
 * yield Len = -1; Mask is left unmodified in that case.
 *
 * The expansion is a single statement (do/while(0)), so the macro can be
 * used as the body of an unbraced if/else. Terminate it with a semicolon.
 */
#define UTF8_COMPUTE(Char, Mask, Len)                                    \
    do {                                                                 \
        const unsigned int utf8_lead_ = (unsigned int) (Char) & 0xffu;   \
                                                                         \
        if ((utf8_lead_ & 0xe0u) == 0xc0u) {                             \
            (Len)  = 2;                                                  \
            (Mask) = 0x1f;                                               \
        }                                                                \
        else if ((utf8_lead_ & 0xf0u) == 0xe0u) {                        \
            (Len)  = 3;                                                  \
            (Mask) = 0x0f;                                               \
        }                                                                \
        else if ((utf8_lead_ & 0xf8u) == 0xf0u) {                        \
            (Len)  = 4;                                                  \
            (Mask) = 0x07;                                               \
        }                                                                \
        else if ((utf8_lead_ & 0xfcu) == 0xf8u) {                        \
            (Len)  = 5;                                                  \
            (Mask) = 0x03;                                               \
        }                                                                \
        else if ((utf8_lead_ & 0xfeu) == 0xfcu) {                        \
            (Len)  = 6;                                                  \
            (Mask) = 0x01;                                               \
        }                                                                \
        else {                                                           \
            (Len) = -1;                                                  \
        }                                                                \
    } while (0)
|---|
| 50 | |
|---|
/*
 * UTF8_GET (Result, Chars, Count, Mask, Len)
 *
 * Decodes one multi-byte UTF-8 sequence.
 *
 *   Result  lvalue receiving the decoded value, or -1 (converted to the
 *           type of Result) if a trailing byte is not of the form 10xxxxxx
 *   Chars   indexable byte sequence; Chars[0] is the lead byte
 *   Count   integer lvalue used as the byte index; afterwards it holds the
 *           index of the offending byte on failure, otherwise the loop's
 *           final index
 *   Mask    payload mask for the lead byte (as produced by UTF8_COMPUTE)
 *   Len     total sequence length in bytes
 *
 * Chars, Count, Mask and Len are expanded more than once, so do not pass
 * expressions with side effects.
 *
 * The expansion is a single statement (do/while(0)), so that guarding the
 * macro with an unbraced 'if' guards the whole decode, not only its first
 * assignment. Terminate it with a semicolon.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)                        \
    do {                                                                 \
        (Result) = (Chars)[0] & (Mask);                                  \
        for ((Count) = 1; (Count) < (Len); ++(Count)) {                  \
            /* every trailing byte must be a continuation byte */        \
            if (((Chars)[(Count)] & 0xc0) != 0x80) {                     \
                (Result) = -1;                                           \
                break;                                                   \
            }                                                            \
            /* append the six payload bits of this byte */               \
            (Result) <<= 6;                                              \
            (Result) |= ((Chars)[(Count)] & 0x3f);                       \
        }                                                                \
    } while (0)
|---|
| 61 | |
|---|
/*
 * Number of bytes to advance for each possible first byte of a sequence.
 *
 * The entries for 0xfe and 0xff used to be zero; they are 1 so that
 * stepping through an invalid UTF-8 string always makes progress instead
 * of looping forever.
 */
char utf8_skip[256] = {
    /* 0x00 - 0x7f */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x80 - 0xbf */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xc0 - 0xdf */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xe0 - 0xef */
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    /* 0xf0 - 0xf7, 0xf8 - 0xfb, 0xfc - 0xfd, 0xfe - 0xff */
    4,4,4,4,4,4,4,4, 5,5,5,5, 6,6, 1,1
};
|---|
| 74 | |
|---|
/*
 * Decodes the UTF-8 sequence starting at p.
 *
 * A first byte without the high bit set is returned as is. Otherwise the
 * sequence is classified with UTF8_COMPUTE and decoded with UTF8_GET.
 *
 * Returns the decoded value, or (unsigned long) -1 if the first byte is
 * not accepted as a lead byte by UTF8_COMPUTE or if UTF8_GET reports a
 * malformed sequence.
 */
unsigned long utf8_get_char (const char *p)
{
    const unsigned char lead = (unsigned char) p[0];
    unsigned long       code;
    int                 mask = 0;
    int                 len;
    int                 idx;

    /* Single-byte case: the byte itself is the value. */
    if (!(lead & 0x80))
        return (unsigned long) lead;

    UTF8_COMPUTE (lead, mask, len);
    if (len == -1)
        return (unsigned long) -1;

    UTF8_GET (code, p, idx, mask, len);

    return code;
}
|---|
| 94 | |
|---|