michael@0: /* michael@0: ********************************************************************** michael@0: * Copyright (C) 2000-2012, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ********************************************************************** michael@0: * file name: ucnvisci.c michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2001JUN26 michael@0: * created by: Ram Viswanadha michael@0: * michael@0: * Date Name Description michael@0: * 24/7/2001 Ram Added support for EXT character handling michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION michael@0: michael@0: #include "unicode/ucnv.h" michael@0: #include "unicode/ucnv_cb.h" michael@0: #include "unicode/utf16.h" michael@0: #include "cmemory.h" michael@0: #include "ucnv_bld.h" michael@0: #include "ucnv_cnv.h" michael@0: #include "cstring.h" michael@0: #include "uassert.h" michael@0: michael@0: #define UCNV_OPTIONS_VERSION_MASK 0xf michael@0: #define NUKTA 0x093c michael@0: #define HALANT 0x094d michael@0: #define ZWNJ 0x200c /* Zero Width Non Joiner */ michael@0: #define ZWJ 0x200d /* Zero width Joiner */ michael@0: #define INVALID_CHAR 0xffff michael@0: #define ATR 0xEF /* Attribute code */ michael@0: #define EXT 0xF0 /* Extension code */ michael@0: #define DANDA 0x0964 michael@0: #define DOUBLE_DANDA 0x0965 michael@0: #define ISCII_NUKTA 0xE9 michael@0: #define ISCII_HALANT 0xE8 michael@0: #define ISCII_DANDA 0xEA michael@0: #define ISCII_INV 0xD9 michael@0: #define ISCII_VOWEL_SIGN_E 0xE0 michael@0: #define INDIC_BLOCK_BEGIN 0x0900 michael@0: #define INDIC_BLOCK_END 0x0D7F michael@0: #define INDIC_RANGE (INDIC_BLOCK_END - INDIC_BLOCK_BEGIN) michael@0: #define VOCALLIC_RR 0x0931 michael@0: #define LF 0x0A michael@0: #define ASCII_END 0xA0 michael@0: #define NO_CHAR_MARKER 0xFFFE michael@0: #define TELUGU_DELTA DELTA * TELUGU michael@0: #define DEV_ABBR_SIGN 0x0970 michael@0: #define DEV_ANUDATTA 0x0952 michael@0: #define EXT_RANGE_BEGIN 0xA1 michael@0: #define EXT_RANGE_END 0xEE michael@0: michael@0: #define PNJ_DELTA 0x0100 michael@0: #define PNJ_BINDI 0x0A02 michael@0: #define PNJ_TIPPI 0x0A70 michael@0: #define PNJ_SIGN_VIRAMA 0x0A4D michael@0: #define PNJ_ADHAK 0x0A71 michael@0: #define PNJ_HA 0x0A39 michael@0: #define PNJ_RRA 0x0A5C michael@0: michael@0: typedef enum { michael@0: DEVANAGARI =0, michael@0: BENGALI, michael@0: GURMUKHI, michael@0: GUJARATI, michael@0: ORIYA, michael@0: TAMIL, michael@0: TELUGU, michael@0: KANNADA, michael@0: MALAYALAM, michael@0: DELTA=0x80 michael@0: }UniLang; michael@0: michael@0: /** michael@0: * Enumeration for switching code pages if + michael@0: * is encountered michael@0: */ michael@0: typedef enum { michael@0: DEF = 0x40, michael@0: RMN = 0x41, michael@0: DEV = 0x42, michael@0: BNG = 0x43, michael@0: TML = 0x44, michael@0: TLG = 0x45, michael@0: ASM = 0x46, michael@0: ORI = 0x47, michael@0: KND = 0x48, michael@0: MLM = 0x49, michael@0: GJR = 0x4A, michael@0: PNJ = 0x4B, michael@0: ARB = 0x71, michael@0: PES = 0x72, michael@0: URD = 0x73, michael@0: SND = 0x74, michael@0: KSM = 0x75, michael@0: PST = 0x76 michael@0: }ISCIILang; michael@0: michael@0: typedef enum { michael@0: DEV_MASK =0x80, michael@0: PNJ_MASK =0x40, michael@0: GJR_MASK =0x20, michael@0: ORI_MASK =0x10, michael@0: BNG_MASK =0x08, michael@0: KND_MASK =0x04, michael@0: MLM_MASK =0x02, michael@0: TML_MASK =0x01, michael@0: ZERO =0x00 michael@0: }MaskEnum; michael@0: michael@0: #define ISCII_CNV_PREFIX "ISCII,version=" michael@0: michael@0: typedef struct { michael@0: UChar contextCharToUnicode; /* previous Unicode codepoint for contextual analysis */ michael@0: UChar contextCharFromUnicode; /* previous Unicode codepoint for contextual analysis */ michael@0: uint16_t defDeltaToUnicode; /* delta for switching to default state when DEF is encountered */ michael@0: uint16_t currentDeltaFromUnicode; /* current delta in Indic block */ michael@0: uint16_t currentDeltaToUnicode; /* current delta in Indic block */ michael@0: MaskEnum currentMaskFromUnicode; /* mask for current state in toUnicode */ michael@0: MaskEnum currentMaskToUnicode; /* mask for current state in toUnicode */ michael@0: MaskEnum defMaskToUnicode; /* mask for default state in toUnicode */ michael@0: UBool isFirstBuffer; /* boolean for fromUnicode to see if we need to announce the first script */ michael@0: UBool resetToDefaultToUnicode; /* boolean for reseting to default delta and mask when a newline is encountered*/ michael@0: char name[sizeof(ISCII_CNV_PREFIX) + 1]; michael@0: UChar32 prevToUnicodeStatus; /* Hold the previous toUnicodeStatus. This is necessary because we may need to know the last two code points. */ michael@0: } UConverterDataISCII; michael@0: michael@0: typedef struct LookupDataStruct { michael@0: UniLang uniLang; michael@0: MaskEnum maskEnum; michael@0: ISCIILang isciiLang; michael@0: } LookupDataStruct; michael@0: michael@0: static const LookupDataStruct lookupInitialData[]={ michael@0: { DEVANAGARI, DEV_MASK, DEV }, michael@0: { BENGALI, BNG_MASK, BNG }, michael@0: { GURMUKHI, PNJ_MASK, PNJ }, michael@0: { GUJARATI, GJR_MASK, GJR }, michael@0: { ORIYA, ORI_MASK, ORI }, michael@0: { TAMIL, TML_MASK, TML }, michael@0: { TELUGU, KND_MASK, TLG }, michael@0: { KANNADA, KND_MASK, KND }, michael@0: { MALAYALAM, MLM_MASK, MLM } michael@0: }; michael@0: michael@0: /* michael@0: * For special handling of certain Gurmukhi characters. michael@0: * Bit 0 (value 1): PNJ consonant michael@0: * Bit 1 (value 2): PNJ Bindi Tippi michael@0: */ michael@0: static const uint8_t pnjMap[80] = { michael@0: /* 0A00..0A0F */ michael@0: 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, michael@0: /* 0A10..0A1F */ michael@0: 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, michael@0: /* 0A20..0A2F */ michael@0: 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, michael@0: /* 0A30..0A3F */ michael@0: 3, 0, 0, 0, 0, 3, 3, 0, 3, 3, 0, 0, 0, 0, 0, 2, michael@0: /* 0A40..0A4F */ michael@0: 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 michael@0: }; michael@0: michael@0: static UBool michael@0: isPNJConsonant(UChar32 c) { michael@0: if (c < 0xa00 || 0xa50 <= c) { michael@0: return FALSE; michael@0: } else { michael@0: return (UBool)(pnjMap[c - 0xa00] & 1); michael@0: } michael@0: } michael@0: michael@0: static UBool michael@0: isPNJBindiTippi(UChar32 c) { michael@0: if (c < 0xa00 || 0xa50 <= c) { michael@0: return FALSE; michael@0: } else { michael@0: return (UBool)(pnjMap[c - 0xa00] >> 1); michael@0: } michael@0: } michael@0: michael@0: static void _ISCIIOpen(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode) { michael@0: if(pArgs->onlyTestIsLoadable) { michael@0: return; michael@0: } michael@0: michael@0: cnv->extraInfo = uprv_malloc(sizeof(UConverterDataISCII)); michael@0: michael@0: if (cnv->extraInfo != NULL) { michael@0: int32_t len=0; michael@0: UConverterDataISCII *converterData= michael@0: (UConverterDataISCII *) cnv->extraInfo; michael@0: converterData->contextCharToUnicode=NO_CHAR_MARKER; michael@0: cnv->toUnicodeStatus = missingCharMarker; michael@0: converterData->contextCharFromUnicode=0x0000; michael@0: converterData->resetToDefaultToUnicode=FALSE; michael@0: /* check if the version requested is supported */ michael@0: if ((pArgs->options & UCNV_OPTIONS_VERSION_MASK) < 9) { michael@0: /* initialize state variables */ michael@0: converterData->currentDeltaFromUnicode michael@0: = converterData->currentDeltaToUnicode michael@0: = converterData->defDeltaToUnicode = (uint16_t)(lookupInitialData[pArgs->options & UCNV_OPTIONS_VERSION_MASK].uniLang * DELTA); michael@0: michael@0: converterData->currentMaskFromUnicode michael@0: = converterData->currentMaskToUnicode michael@0: = converterData->defMaskToUnicode = lookupInitialData[pArgs->options & UCNV_OPTIONS_VERSION_MASK].maskEnum; michael@0: michael@0: converterData->isFirstBuffer=TRUE; michael@0: (void)uprv_strcpy(converterData->name, ISCII_CNV_PREFIX); michael@0: len = (int32_t)uprv_strlen(converterData->name); michael@0: converterData->name[len]= (char)((pArgs->options & UCNV_OPTIONS_VERSION_MASK) + '0'); michael@0: converterData->name[len+1]=0; michael@0: michael@0: converterData->prevToUnicodeStatus = 0x0000; michael@0: } else { michael@0: uprv_free(cnv->extraInfo); michael@0: cnv->extraInfo = NULL; michael@0: *errorCode = U_ILLEGAL_ARGUMENT_ERROR; michael@0: } michael@0: michael@0: } else { michael@0: *errorCode =U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: } michael@0: michael@0: static void _ISCIIClose(UConverter *cnv) { michael@0: if (cnv->extraInfo!=NULL) { michael@0: if (!cnv->isExtraLocal) { michael@0: uprv_free(cnv->extraInfo); michael@0: } michael@0: cnv->extraInfo=NULL; michael@0: } michael@0: } michael@0: michael@0: static const char* _ISCIIgetName(const UConverter* cnv) { michael@0: if (cnv->extraInfo) { michael@0: UConverterDataISCII* myData= (UConverterDataISCII*)cnv->extraInfo; michael@0: return myData->name; michael@0: } michael@0: return NULL; michael@0: } michael@0: michael@0: static void _ISCIIReset(UConverter *cnv, UConverterResetChoice choice) { michael@0: UConverterDataISCII* data =(UConverterDataISCII *) (cnv->extraInfo); michael@0: if (choice<=UCNV_RESET_TO_UNICODE) { michael@0: cnv->toUnicodeStatus = missingCharMarker; michael@0: cnv->mode=0; michael@0: data->currentDeltaToUnicode=data->defDeltaToUnicode; michael@0: data->currentMaskToUnicode = data->defMaskToUnicode; michael@0: data->contextCharToUnicode=NO_CHAR_MARKER; michael@0: data->prevToUnicodeStatus = 0x0000; michael@0: } michael@0: if (choice!=UCNV_RESET_TO_UNICODE) { michael@0: cnv->fromUChar32=0x0000; michael@0: data->contextCharFromUnicode=0x00; michael@0: data->currentMaskFromUnicode=data->defMaskToUnicode; michael@0: data->currentDeltaFromUnicode=data->defDeltaToUnicode; michael@0: data->isFirstBuffer=TRUE; michael@0: data->resetToDefaultToUnicode=FALSE; michael@0: } michael@0: } michael@0: michael@0: /** michael@0: * The values in validity table are indexed by the lower bits of Unicode michael@0: * range 0x0900 - 0x09ff. The values have a structure like: michael@0: * --------------------------------------------------------------- michael@0: * | DEV | PNJ | GJR | ORI | BNG | TLG | MLM | TML | michael@0: * | | | | | ASM | KND | | | michael@0: * --------------------------------------------------------------- michael@0: * If a code point is valid in a particular script michael@0: * then that bit is turned on michael@0: * michael@0: * Unicode does not distinguish between Bengali and Assamese so we use 1 bit for michael@0: * to represent these languages michael@0: * michael@0: * Telugu and Kannada have same codepoints except for Vocallic_RR which we special case michael@0: * and combine and use 1 bit to represent these languages. michael@0: * michael@0: * TODO: It is probably easier to understand and maintain to change this michael@0: * to use uint16_t and give each of the 9 Unicode/script blocks its own bit. michael@0: */ michael@0: michael@0: static const uint8_t validityTable[128] = { michael@0: /* This state table is tool generated please do not edit unless you know exactly what you are doing */ michael@0: /* Note: This table was edited to mirror the Windows XP implementation */ michael@0: /*ISCII:Valid:Unicode */ michael@0: /*0xa0 : 0x00: 0x900 */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0xa1 : 0xb8: 0x901 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO , michael@0: /*0xa2 : 0xfe: 0x902 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xa3 : 0xbf: 0x903 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0x00 : 0x00: 0x904 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0xa4 : 0xff: 0x905 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xa5 : 0xff: 0x906 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xa6 : 0xff: 0x907 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xa7 : 0xff: 0x908 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xa8 : 0xff: 0x909 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xa9 : 0xff: 0x90a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xaa : 0xfe: 0x90b */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , michael@0: /*0x00 : 0x00: 0x90c */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , michael@0: /*0xae : 0x80: 0x90d */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0xab : 0x87: 0x90e */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xac : 0xff: 0x90f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xad : 0xff: 0x910 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xb2 : 0x80: 0x911 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0xaf : 0x87: 0x912 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xb0 : 0xff: 0x913 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xb1 : 0xff: 0x914 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xb3 : 0xff: 0x915 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xb4 : 0xfe: 0x916 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , michael@0: /*0xb5 : 0xfe: 0x917 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , michael@0: /*0xb6 : 0xfe: 0x918 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , michael@0: /*0xb7 : 0xff: 0x919 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xb8 : 0xff: 0x91a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xb9 : 0xfe: 0x91b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , michael@0: /*0xba : 0xff: 0x91c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xbb : 0xfe: 0x91d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , michael@0: /*0xbc : 0xff: 0x91e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xbd : 0xff: 0x91f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xbe : 0xfe: 0x920 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , michael@0: /*0xbf : 0xfe: 0x921 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , michael@0: /*0xc0 : 0xfe: 0x922 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , michael@0: /*0xc1 : 0xff: 0x923 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xc2 : 0xff: 0x924 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xc3 : 0xfe: 0x925 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , michael@0: /*0xc4 : 0xfe: 0x926 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , michael@0: /*0xc5 : 0xfe: 0x927 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , michael@0: /*0xc6 : 0xff: 0x928 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xc7 : 0x81: 0x929 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + TML_MASK , michael@0: /*0xc8 : 0xff: 0x92a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xc9 : 0xfe: 0x92b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , michael@0: /*0xca : 0xfe: 0x92c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , michael@0: /*0xcb : 0xfe: 0x92d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , michael@0: /*0xcc : 0xfe: 0x92e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xcd : 0xff: 0x92f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xcf : 0xff: 0x930 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xd0 : 0x87: 0x931 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + MLM_MASK + TML_MASK , michael@0: /*0xd1 : 0xff: 0x932 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xd2 : 0xb7: 0x933 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + ZERO + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xd3 : 0x83: 0x934 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + MLM_MASK + TML_MASK , michael@0: /*0xd4 : 0xff: 0x935 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + ZERO + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xd5 : 0xfe: 0x936 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , michael@0: /*0xd6 : 0xbf: 0x937 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xd7 : 0xff: 0x938 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xd8 : 0xff: 0x939 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0x00 : 0x00: 0x93A */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0x00 : 0x00: 0x93B */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0xe9 : 0xda: 0x93c */ DEV_MASK + PNJ_MASK + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO , michael@0: /*0x00 : 0x00: 0x93d */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0xda : 0xff: 0x93e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xdb : 0xff: 0x93f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xdc : 0xff: 0x940 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xdd : 0xff: 0x941 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xde : 0xff: 0x942 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xdf : 0xbe: 0x943 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , michael@0: /*0x00 : 0x00: 0x944 */ DEV_MASK + ZERO + GJR_MASK + ZERO + BNG_MASK + KND_MASK + ZERO + ZERO , michael@0: /*0xe3 : 0x80: 0x945 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0xe0 : 0x87: 0x946 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xe1 : 0xff: 0x947 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xe2 : 0xff: 0x948 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xe7 : 0x80: 0x949 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0xe4 : 0x87: 0x94a */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xe5 : 0xff: 0x94b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xe6 : 0xff: 0x94c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xe8 : 0xff: 0x94d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xec : 0x00: 0x94e */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0xed : 0x00: 0x94f */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0x00 : 0x00: 0x950 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0x00 : 0x00: 0x951 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0x00 : 0x00: 0x952 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0x00 : 0x00: 0x953 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0x00 : 0x00: 0x954 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0x00 : 0x00: 0x955 */ ZERO + ZERO + ZERO + ZERO + ZERO + KND_MASK + ZERO + ZERO , michael@0: /*0x00 : 0x00: 0x956 */ ZERO + ZERO + ZERO + ORI_MASK + ZERO + KND_MASK + ZERO + ZERO , michael@0: /*0x00 : 0x00: 0x957 */ ZERO + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + MLM_MASK + ZERO , michael@0: /*0x00 : 0x00: 0x958 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0x00 : 0x00: 0x959 */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0x00 : 0x00: 0x95a */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0x00 : 0x00: 0x95b */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0x00 : 0x00: 0x95c */ DEV_MASK + PNJ_MASK + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO , michael@0: /*0x00 : 0x00: 0x95d */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO , michael@0: /*0x00 : 0x00: 0x95e */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0xce : 0x98: 0x95f */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO , michael@0: /*0x00 : 0x00: 0x960 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , michael@0: /*0x00 : 0x00: 0x961 */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , michael@0: /*0x00 : 0x00: 0x962 */ DEV_MASK + ZERO + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO , michael@0: /*0x00 : 0x00: 0x963 */ DEV_MASK + ZERO + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO , michael@0: /*0xea : 0xf8: 0x964 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0xeaea : 0x00: 0x965*/ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /*0xf1 : 0xff: 0x966 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xf2 : 0xff: 0x967 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xf3 : 0xff: 0x968 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xf4 : 0xff: 0x969 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xf5 : 0xff: 0x96a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xf6 : 0xff: 0x96b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xf7 : 0xff: 0x96c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xf8 : 0xff: 0x96d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xf9 : 0xff: 0x96e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0xfa : 0xff: 0x96f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , michael@0: /*0x00 : 0x80: 0x970 */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , michael@0: /* michael@0: * The length of the array is 128 to provide values for 0x900..0x97f. michael@0: * The last 15 entries for 0x971..0x97f of the validity table are all zero michael@0: * because no Indic script uses such Unicode code points. michael@0: */ michael@0: /*0x00 : 0x00: 0x9yz */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO michael@0: }; michael@0: michael@0: static const uint16_t fromUnicodeTable[128]={ michael@0: 0x00a0 ,/* 0x0900 */ michael@0: 0x00a1 ,/* 0x0901 */ michael@0: 0x00a2 ,/* 0x0902 */ michael@0: 0x00a3 ,/* 0x0903 */ michael@0: 0xa4e0 ,/* 0x0904 */ michael@0: 0x00a4 ,/* 0x0905 */ michael@0: 0x00a5 ,/* 0x0906 */ michael@0: 0x00a6 ,/* 0x0907 */ michael@0: 0x00a7 ,/* 0x0908 */ michael@0: 0x00a8 ,/* 0x0909 */ michael@0: 0x00a9 ,/* 0x090a */ michael@0: 0x00aa ,/* 0x090b */ michael@0: 0xA6E9 ,/* 0x090c */ michael@0: 0x00ae ,/* 0x090d */ michael@0: 0x00ab ,/* 0x090e */ michael@0: 0x00ac ,/* 0x090f */ michael@0: 0x00ad ,/* 0x0910 */ michael@0: 0x00b2 ,/* 0x0911 */ michael@0: 0x00af ,/* 0x0912 */ michael@0: 0x00b0 ,/* 0x0913 */ michael@0: 0x00b1 ,/* 0x0914 */ michael@0: 0x00b3 ,/* 0x0915 */ michael@0: 0x00b4 ,/* 0x0916 */ michael@0: 0x00b5 ,/* 0x0917 */ michael@0: 0x00b6 ,/* 0x0918 */ michael@0: 0x00b7 ,/* 0x0919 */ michael@0: 0x00b8 ,/* 0x091a */ michael@0: 0x00b9 ,/* 0x091b */ michael@0: 0x00ba ,/* 0x091c */ michael@0: 0x00bb ,/* 0x091d */ michael@0: 0x00bc ,/* 0x091e */ michael@0: 0x00bd ,/* 0x091f */ michael@0: 0x00be ,/* 0x0920 */ michael@0: 0x00bf ,/* 0x0921 */ michael@0: 0x00c0 ,/* 0x0922 */ michael@0: 0x00c1 ,/* 0x0923 */ michael@0: 0x00c2 ,/* 0x0924 */ michael@0: 0x00c3 ,/* 0x0925 */ michael@0: 0x00c4 ,/* 0x0926 */ michael@0: 0x00c5 ,/* 0x0927 */ michael@0: 0x00c6 ,/* 0x0928 */ michael@0: 0x00c7 ,/* 0x0929 */ michael@0: 0x00c8 ,/* 0x092a */ michael@0: 0x00c9 ,/* 0x092b */ michael@0: 0x00ca ,/* 0x092c */ michael@0: 0x00cb ,/* 0x092d */ michael@0: 0x00cc ,/* 0x092e */ michael@0: 0x00cd ,/* 0x092f */ michael@0: 0x00cf ,/* 0x0930 */ michael@0: 0x00d0 ,/* 0x0931 */ michael@0: 0x00d1 ,/* 0x0932 */ michael@0: 0x00d2 ,/* 0x0933 */ michael@0: 0x00d3 ,/* 0x0934 */ michael@0: 0x00d4 ,/* 0x0935 */ michael@0: 0x00d5 ,/* 0x0936 */ michael@0: 0x00d6 ,/* 0x0937 */ michael@0: 0x00d7 ,/* 0x0938 */ michael@0: 0x00d8 ,/* 0x0939 */ michael@0: 0xFFFF ,/* 0x093A */ michael@0: 0xFFFF ,/* 0x093B */ michael@0: 0x00e9 ,/* 0x093c */ michael@0: 0xEAE9 ,/* 0x093d */ michael@0: 0x00da ,/* 0x093e */ michael@0: 0x00db ,/* 0x093f */ michael@0: 0x00dc ,/* 0x0940 */ michael@0: 0x00dd ,/* 0x0941 */ michael@0: 0x00de ,/* 0x0942 */ michael@0: 0x00df ,/* 0x0943 */ michael@0: 0xDFE9 ,/* 0x0944 */ michael@0: 0x00e3 ,/* 0x0945 */ michael@0: 0x00e0 ,/* 0x0946 */ michael@0: 0x00e1 ,/* 0x0947 */ michael@0: 0x00e2 ,/* 0x0948 */ michael@0: 0x00e7 ,/* 0x0949 */ michael@0: 0x00e4 ,/* 0x094a */ michael@0: 0x00e5 ,/* 0x094b */ michael@0: 0x00e6 ,/* 0x094c */ michael@0: 0x00e8 ,/* 0x094d */ michael@0: 0x00ec ,/* 0x094e */ michael@0: 0x00ed ,/* 0x094f */ michael@0: 0xA1E9 ,/* 0x0950 */ /* OM Symbol */ michael@0: 0xFFFF ,/* 0x0951 */ michael@0: 0xF0B8 ,/* 0x0952 */ michael@0: 0xFFFF ,/* 0x0953 */ michael@0: 0xFFFF ,/* 0x0954 */ michael@0: 0xFFFF ,/* 0x0955 */ michael@0: 0xFFFF ,/* 0x0956 */ michael@0: 0xFFFF ,/* 0x0957 */ michael@0: 0xb3e9 ,/* 0x0958 */ michael@0: 0xb4e9 ,/* 0x0959 */ michael@0: 0xb5e9 ,/* 0x095a */ michael@0: 0xbae9 ,/* 0x095b */ michael@0: 0xbfe9 ,/* 0x095c */ michael@0: 0xC0E9 ,/* 0x095d */ michael@0: 0xc9e9 ,/* 0x095e */ michael@0: 0x00ce ,/* 0x095f */ michael@0: 0xAAe9 ,/* 0x0960 */ michael@0: 0xA7E9 ,/* 0x0961 */ michael@0: 0xDBE9 ,/* 0x0962 */ michael@0: 0xDCE9 ,/* 0x0963 */ michael@0: 0x00ea ,/* 0x0964 */ michael@0: 0xeaea ,/* 0x0965 */ michael@0: 0x00f1 ,/* 0x0966 */ michael@0: 0x00f2 ,/* 0x0967 */ michael@0: 0x00f3 ,/* 0x0968 */ michael@0: 0x00f4 ,/* 0x0969 */ michael@0: 0x00f5 ,/* 0x096a */ michael@0: 0x00f6 ,/* 0x096b */ michael@0: 0x00f7 ,/* 0x096c */ michael@0: 0x00f8 ,/* 0x096d */ michael@0: 0x00f9 ,/* 0x096e */ michael@0: 0x00fa ,/* 0x096f */ michael@0: 0xF0BF ,/* 0x0970 */ michael@0: 0xFFFF ,/* 0x0971 */ michael@0: 0xFFFF ,/* 0x0972 */ michael@0: 0xFFFF ,/* 0x0973 */ michael@0: 0xFFFF ,/* 0x0974 */ michael@0: 0xFFFF ,/* 0x0975 */ michael@0: 0xFFFF ,/* 0x0976 */ michael@0: 0xFFFF ,/* 0x0977 */ michael@0: 0xFFFF ,/* 0x0978 */ michael@0: 0xFFFF ,/* 0x0979 */ michael@0: 0xFFFF ,/* 0x097a */ michael@0: 0xFFFF ,/* 0x097b */ michael@0: 0xFFFF ,/* 0x097c */ michael@0: 0xFFFF ,/* 0x097d */ michael@0: 0xFFFF ,/* 0x097e */ michael@0: 0xFFFF ,/* 0x097f */ michael@0: }; michael@0: static const uint16_t toUnicodeTable[256]={ michael@0: 0x0000,/* 0x00 */ michael@0: 0x0001,/* 0x01 */ michael@0: 0x0002,/* 0x02 */ michael@0: 0x0003,/* 0x03 */ michael@0: 0x0004,/* 0x04 */ michael@0: 0x0005,/* 0x05 */ michael@0: 0x0006,/* 0x06 */ michael@0: 0x0007,/* 0x07 */ michael@0: 0x0008,/* 0x08 */ michael@0: 0x0009,/* 0x09 */ michael@0: 0x000a,/* 0x0a */ michael@0: 0x000b,/* 0x0b */ michael@0: 0x000c,/* 0x0c */ michael@0: 0x000d,/* 0x0d */ michael@0: 0x000e,/* 0x0e */ michael@0: 0x000f,/* 0x0f */ michael@0: 0x0010,/* 0x10 */ michael@0: 0x0011,/* 0x11 */ michael@0: 0x0012,/* 0x12 */ michael@0: 0x0013,/* 0x13 */ michael@0: 0x0014,/* 0x14 */ michael@0: 0x0015,/* 0x15 */ michael@0: 0x0016,/* 0x16 */ michael@0: 0x0017,/* 0x17 */ michael@0: 0x0018,/* 0x18 */ michael@0: 0x0019,/* 0x19 */ michael@0: 0x001a,/* 0x1a */ michael@0: 0x001b,/* 0x1b */ michael@0: 0x001c,/* 0x1c */ michael@0: 0x001d,/* 0x1d */ michael@0: 0x001e,/* 0x1e */ michael@0: 0x001f,/* 0x1f */ michael@0: 0x0020,/* 0x20 */ michael@0: 0x0021,/* 0x21 */ michael@0: 0x0022,/* 0x22 */ michael@0: 0x0023,/* 0x23 */ michael@0: 0x0024,/* 0x24 */ michael@0: 0x0025,/* 0x25 */ michael@0: 0x0026,/* 0x26 */ michael@0: 0x0027,/* 0x27 */ michael@0: 0x0028,/* 0x28 */ michael@0: 0x0029,/* 0x29 */ michael@0: 0x002a,/* 0x2a */ michael@0: 0x002b,/* 0x2b */ michael@0: 0x002c,/* 0x2c */ michael@0: 0x002d,/* 0x2d */ michael@0: 0x002e,/* 0x2e */ michael@0: 0x002f,/* 0x2f */ michael@0: 0x0030,/* 0x30 */ michael@0: 0x0031,/* 0x31 */ michael@0: 0x0032,/* 0x32 */ michael@0: 0x0033,/* 0x33 */ michael@0: 0x0034,/* 0x34 */ michael@0: 0x0035,/* 0x35 */ michael@0: 0x0036,/* 0x36 */ michael@0: 0x0037,/* 0x37 */ michael@0: 0x0038,/* 0x38 */ michael@0: 0x0039,/* 0x39 */ michael@0: 0x003A,/* 0x3A */ michael@0: 0x003B,/* 0x3B */ michael@0: 0x003c,/* 0x3c */ michael@0: 0x003d,/* 0x3d */ michael@0: 0x003e,/* 0x3e */ michael@0: 0x003f,/* 0x3f */ michael@0: 0x0040,/* 0x40 */ michael@0: 0x0041,/* 0x41 */ michael@0: 0x0042,/* 0x42 */ michael@0: 0x0043,/* 0x43 */ michael@0: 0x0044,/* 0x44 */ michael@0: 0x0045,/* 0x45 */ michael@0: 0x0046,/* 0x46 */ michael@0: 0x0047,/* 0x47 */ michael@0: 0x0048,/* 0x48 */ michael@0: 0x0049,/* 0x49 */ michael@0: 0x004a,/* 0x4a */ michael@0: 0x004b,/* 0x4b */ michael@0: 0x004c,/* 0x4c */ michael@0: 0x004d,/* 0x4d */ michael@0: 0x004e,/* 0x4e */ michael@0: 0x004f,/* 0x4f */ michael@0: 0x0050,/* 0x50 */ michael@0: 0x0051,/* 0x51 */ michael@0: 0x0052,/* 0x52 */ michael@0: 0x0053,/* 0x53 */ michael@0: 0x0054,/* 0x54 */ michael@0: 0x0055,/* 0x55 */ michael@0: 0x0056,/* 0x56 */ michael@0: 0x0057,/* 0x57 */ michael@0: 0x0058,/* 0x58 */ michael@0: 0x0059,/* 0x59 */ michael@0: 0x005a,/* 0x5a */ michael@0: 0x005b,/* 0x5b */ michael@0: 0x005c,/* 0x5c */ michael@0: 0x005d,/* 0x5d */ michael@0: 0x005e,/* 0x5e */ michael@0: 0x005f,/* 0x5f */ michael@0: 0x0060,/* 0x60 */ michael@0: 0x0061,/* 0x61 */ michael@0: 0x0062,/* 0x62 */ michael@0: 0x0063,/* 0x63 */ michael@0: 0x0064,/* 0x64 */ michael@0: 0x0065,/* 0x65 */ michael@0: 0x0066,/* 0x66 */ michael@0: 0x0067,/* 0x67 */ michael@0: 0x0068,/* 0x68 */ michael@0: 0x0069,/* 0x69 */ michael@0: 0x006a,/* 0x6a */ michael@0: 0x006b,/* 0x6b */ michael@0: 0x006c,/* 0x6c */ michael@0: 0x006d,/* 0x6d */ michael@0: 0x006e,/* 0x6e */ michael@0: 0x006f,/* 0x6f */ michael@0: 0x0070,/* 0x70 */ michael@0: 0x0071,/* 0x71 */ michael@0: 0x0072,/* 0x72 */ michael@0: 0x0073,/* 0x73 */ michael@0: 0x0074,/* 0x74 */ michael@0: 0x0075,/* 0x75 */ michael@0: 0x0076,/* 0x76 */ michael@0: 0x0077,/* 0x77 */ michael@0: 0x0078,/* 0x78 */ michael@0: 0x0079,/* 0x79 */ michael@0: 0x007a,/* 0x7a */ michael@0: 0x007b,/* 0x7b */ michael@0: 0x007c,/* 0x7c */ michael@0: 0x007d,/* 0x7d */ michael@0: 0x007e,/* 0x7e */ michael@0: 0x007f,/* 0x7f */ michael@0: 0x0080,/* 0x80 */ michael@0: 0x0081,/* 0x81 */ michael@0: 0x0082,/* 0x82 */ michael@0: 0x0083,/* 0x83 */ michael@0: 0x0084,/* 0x84 */ michael@0: 0x0085,/* 0x85 */ michael@0: 0x0086,/* 0x86 */ michael@0: 0x0087,/* 0x87 */ michael@0: 0x0088,/* 0x88 */ michael@0: 0x0089,/* 0x89 */ michael@0: 0x008a,/* 0x8a */ michael@0: 0x008b,/* 0x8b */ michael@0: 0x008c,/* 0x8c */ michael@0: 0x008d,/* 0x8d */ michael@0: 0x008e,/* 0x8e */ michael@0: 0x008f,/* 0x8f */ michael@0: 0x0090,/* 0x90 */ michael@0: 0x0091,/* 0x91 */ michael@0: 0x0092,/* 0x92 */ michael@0: 0x0093,/* 0x93 */ michael@0: 0x0094,/* 0x94 */ michael@0: 0x0095,/* 0x95 */ michael@0: 0x0096,/* 0x96 */ michael@0: 0x0097,/* 0x97 */ michael@0: 0x0098,/* 0x98 */ michael@0: 0x0099,/* 0x99 */ michael@0: 0x009a,/* 0x9a */ michael@0: 0x009b,/* 0x9b */ michael@0: 0x009c,/* 0x9c */ michael@0: 0x009d,/* 0x9d */ michael@0: 0x009e,/* 0x9e */ michael@0: 0x009f,/* 0x9f */ michael@0: 0x00A0,/* 0xa0 */ michael@0: 0x0901,/* 0xa1 */ michael@0: 0x0902,/* 0xa2 */ michael@0: 0x0903,/* 0xa3 */ michael@0: 0x0905,/* 0xa4 */ michael@0: 0x0906,/* 0xa5 */ michael@0: 0x0907,/* 0xa6 */ michael@0: 0x0908,/* 0xa7 */ michael@0: 0x0909,/* 0xa8 */ michael@0: 0x090a,/* 0xa9 */ michael@0: 0x090b,/* 0xaa */ michael@0: 0x090e,/* 0xab */ michael@0: 0x090f,/* 0xac */ michael@0: 0x0910,/* 0xad */ michael@0: 0x090d,/* 0xae */ michael@0: 0x0912,/* 0xaf */ michael@0: 0x0913,/* 0xb0 */ michael@0: 0x0914,/* 0xb1 */ michael@0: 0x0911,/* 0xb2 */ michael@0: 0x0915,/* 0xb3 */ michael@0: 0x0916,/* 0xb4 */ michael@0: 0x0917,/* 0xb5 */ michael@0: 0x0918,/* 0xb6 */ michael@0: 0x0919,/* 0xb7 */ michael@0: 0x091a,/* 0xb8 */ michael@0: 0x091b,/* 0xb9 */ michael@0: 0x091c,/* 0xba */ michael@0: 0x091d,/* 0xbb */ michael@0: 0x091e,/* 0xbc */ michael@0: 0x091f,/* 0xbd */ michael@0: 0x0920,/* 0xbe */ michael@0: 0x0921,/* 0xbf */ michael@0: 0x0922,/* 0xc0 */ michael@0: 0x0923,/* 0xc1 */ michael@0: 0x0924,/* 0xc2 */ michael@0: 0x0925,/* 0xc3 */ michael@0: 0x0926,/* 0xc4 */ michael@0: 0x0927,/* 0xc5 */ michael@0: 0x0928,/* 0xc6 */ michael@0: 0x0929,/* 0xc7 */ michael@0: 0x092a,/* 0xc8 */ michael@0: 0x092b,/* 0xc9 */ michael@0: 0x092c,/* 0xca */ michael@0: 0x092d,/* 0xcb */ michael@0: 0x092e,/* 0xcc */ michael@0: 0x092f,/* 0xcd */ michael@0: 0x095f,/* 0xce */ michael@0: 0x0930,/* 0xcf */ michael@0: 0x0931,/* 0xd0 */ michael@0: 0x0932,/* 0xd1 */ michael@0: 0x0933,/* 0xd2 */ michael@0: 0x0934,/* 0xd3 */ michael@0: 0x0935,/* 0xd4 */ michael@0: 0x0936,/* 0xd5 */ michael@0: 0x0937,/* 0xd6 */ michael@0: 0x0938,/* 0xd7 */ michael@0: 0x0939,/* 0xd8 */ michael@0: 0x200D,/* 0xd9 */ michael@0: 0x093e,/* 0xda */ michael@0: 0x093f,/* 0xdb */ michael@0: 0x0940,/* 0xdc */ michael@0: 0x0941,/* 0xdd */ michael@0: 0x0942,/* 0xde */ michael@0: 0x0943,/* 0xdf */ michael@0: 0x0946,/* 0xe0 */ michael@0: 0x0947,/* 0xe1 */ michael@0: 0x0948,/* 0xe2 */ michael@0: 0x0945,/* 0xe3 */ michael@0: 0x094a,/* 0xe4 */ michael@0: 0x094b,/* 0xe5 */ michael@0: 0x094c,/* 0xe6 */ michael@0: 0x0949,/* 0xe7 */ michael@0: 0x094d,/* 0xe8 */ michael@0: 0x093c,/* 0xe9 */ michael@0: 0x0964,/* 0xea */ michael@0: 0xFFFF,/* 0xeb */ michael@0: 0xFFFF,/* 0xec */ michael@0: 0xFFFF,/* 0xed */ michael@0: 0xFFFF,/* 0xee */ michael@0: 0xFFFF,/* 0xef */ michael@0: 0xFFFF,/* 0xf0 */ michael@0: 0x0966,/* 0xf1 */ michael@0: 0x0967,/* 0xf2 */ michael@0: 0x0968,/* 0xf3 */ michael@0: 0x0969,/* 0xf4 */ michael@0: 0x096a,/* 0xf5 */ michael@0: 0x096b,/* 0xf6 */ michael@0: 0x096c,/* 0xf7 */ michael@0: 0x096d,/* 0xf8 */ michael@0: 0x096e,/* 0xf9 */ michael@0: 0x096f,/* 0xfa */ michael@0: 0xFFFF,/* 0xfb */ michael@0: 0xFFFF,/* 0xfc */ michael@0: 0xFFFF,/* 0xfd */ michael@0: 0xFFFF,/* 0xfe */ michael@0: 0xFFFF /* 0xff */ michael@0: }; michael@0: michael@0: static const uint16_t vowelSignESpecialCases[][2]={ michael@0: { 2 /*length of array*/ , 0 }, michael@0: { 0xA4 , 0x0904 }, michael@0: }; michael@0: michael@0: static const uint16_t nuktaSpecialCases[][2]={ michael@0: { 16 /*length of array*/ , 0 }, michael@0: { 0xA6 , 0x090c }, michael@0: { 0xEA , 0x093D }, michael@0: { 0xDF , 0x0944 }, michael@0: { 0xA1 , 0x0950 }, michael@0: { 0xb3 , 0x0958 }, michael@0: { 0xb4 , 0x0959 }, michael@0: { 0xb5 , 0x095a }, michael@0: { 0xba , 0x095b }, michael@0: { 0xbf , 0x095c }, michael@0: { 0xC0 , 0x095d }, michael@0: { 0xc9 , 0x095e }, michael@0: { 0xAA , 0x0960 }, michael@0: { 0xA7 , 0x0961 }, michael@0: { 0xDB , 0x0962 }, michael@0: { 0xDC , 0x0963 }, michael@0: }; michael@0: michael@0: michael@0: #define WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err){ \ michael@0: int32_t offset = (int32_t)(source - args->source-1); \ michael@0: /* write the targetUniChar to target */ \ michael@0: if(target < targetLimit){ \ michael@0: if(targetByteUnit <= 0xFF){ \ michael@0: *(target)++ = (uint8_t)(targetByteUnit); \ michael@0: if(offsets){ \ michael@0: *(offsets++) = offset; \ michael@0: } \ michael@0: }else{ \ michael@0: if (targetByteUnit > 0xFFFF) { \ michael@0: *(target)++ = (uint8_t)(targetByteUnit>>16); \ michael@0: if (offsets) { \ michael@0: --offset; \ michael@0: *(offsets++) = offset; \ michael@0: } \ michael@0: } \ michael@0: if (!(target < targetLimit)) { \ michael@0: args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \ michael@0: (uint8_t)(targetByteUnit >> 8); \ michael@0: args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \ michael@0: (uint8_t)targetByteUnit; \ michael@0: *err = U_BUFFER_OVERFLOW_ERROR; \ michael@0: } else { \ michael@0: *(target)++ = (uint8_t)(targetByteUnit>>8); \ michael@0: if(offsets){ \ michael@0: *(offsets++) = offset; \ michael@0: } \ michael@0: if(target < targetLimit){ \ michael@0: *(target)++ = (uint8_t) targetByteUnit; \ michael@0: if(offsets){ \ michael@0: *(offsets++) = offset ; \ michael@0: } \ michael@0: }else{ \ michael@0: args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =\ michael@0: (uint8_t) (targetByteUnit); \ michael@0: *err = U_BUFFER_OVERFLOW_ERROR; \ michael@0: } \ michael@0: } \ michael@0: } \ michael@0: }else{ \ michael@0: if (targetByteUnit & 0xFF0000) { \ michael@0: args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \ michael@0: (uint8_t) (targetByteUnit >>16); \ michael@0: } \ michael@0: if(targetByteUnit & 0xFF00){ \ michael@0: args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \ michael@0: (uint8_t) (targetByteUnit >>8); \ michael@0: } \ michael@0: args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \ michael@0: (uint8_t) (targetByteUnit); \ michael@0: *err = U_BUFFER_OVERFLOW_ERROR; \ michael@0: } \ michael@0: } michael@0: michael@0: /* Rules: michael@0: * Explicit Halant : michael@0: * + michael@0: * Soft Halant : michael@0: * + michael@0: */ michael@0: michael@0: static void UConverter_fromUnicode_ISCII_OFFSETS_LOGIC( michael@0: UConverterFromUnicodeArgs * args, UErrorCode * err) { michael@0: const UChar *source = args->source; michael@0: const UChar *sourceLimit = args->sourceLimit; michael@0: unsigned char *target = (unsigned char *) args->target; michael@0: unsigned char *targetLimit = (unsigned char *) args->targetLimit; michael@0: int32_t* offsets = args->offsets; michael@0: uint32_t targetByteUnit = 0x0000; michael@0: UChar32 sourceChar = 0x0000; michael@0: UChar32 tempContextFromUnicode = 0x0000; /* For special handling of the Gurmukhi script. */ michael@0: UConverterDataISCII *converterData; michael@0: uint16_t newDelta=0; michael@0: uint16_t range = 0; michael@0: UBool deltaChanged = FALSE; michael@0: michael@0: if ((args->converter == NULL) || (args->targetLimit < args->target) || (args->sourceLimit < args->source)) { michael@0: *err = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return; michael@0: } michael@0: /* initialize data */ michael@0: converterData=(UConverterDataISCII*)args->converter->extraInfo; michael@0: newDelta=converterData->currentDeltaFromUnicode; michael@0: range = (uint16_t)(newDelta/DELTA); michael@0: michael@0: if ((sourceChar = args->converter->fromUChar32)!=0) { michael@0: goto getTrail; michael@0: } michael@0: michael@0: /*writing the char to the output stream */ michael@0: while (source < sourceLimit) { michael@0: /* Write the language code following LF only if LF is not the last character. */ michael@0: if (args->converter->fromUnicodeStatus == LF) { michael@0: targetByteUnit = ATR<<8; michael@0: targetByteUnit += (uint8_t) lookupInitialData[range].isciiLang; michael@0: args->converter->fromUnicodeStatus = 0x0000; michael@0: /* now append ATR and language code */ michael@0: WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err); michael@0: if (U_FAILURE(*err)) { michael@0: break; michael@0: } michael@0: } michael@0: michael@0: sourceChar = *source++; michael@0: tempContextFromUnicode = converterData->contextCharFromUnicode; michael@0: michael@0: targetByteUnit = missingCharMarker; michael@0: michael@0: /*check if input is in ASCII and C0 control codes range*/ michael@0: if (sourceChar <= ASCII_END) { michael@0: args->converter->fromUnicodeStatus = sourceChar; michael@0: WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,sourceChar,err); michael@0: if (U_FAILURE(*err)) { michael@0: break; michael@0: } michael@0: continue; michael@0: } michael@0: switch (sourceChar) { michael@0: case ZWNJ: michael@0: /* contextChar has HALANT */ michael@0: if (converterData->contextCharFromUnicode) { michael@0: converterData->contextCharFromUnicode = 0x00; michael@0: targetByteUnit = ISCII_HALANT; michael@0: } else { michael@0: /* consume ZWNJ and continue */ michael@0: converterData->contextCharFromUnicode = 0x00; michael@0: continue; michael@0: } michael@0: break; michael@0: case ZWJ: michael@0: /* contextChar has HALANT */ michael@0: if (converterData->contextCharFromUnicode) { michael@0: targetByteUnit = ISCII_NUKTA; michael@0: } else { michael@0: targetByteUnit =ISCII_INV; michael@0: } michael@0: converterData->contextCharFromUnicode = 0x00; michael@0: break; michael@0: default: michael@0: /* is the sourceChar in the INDIC_RANGE? */ michael@0: if ((uint16_t)(INDIC_BLOCK_END-sourceChar) <= INDIC_RANGE) { michael@0: /* Danda and Double Danda are valid in Northern scripts.. since Unicode michael@0: * does not include these codepoints in all Northern scrips we need to michael@0: * filter them out michael@0: */ michael@0: if (sourceChar!= DANDA && sourceChar != DOUBLE_DANDA) { michael@0: /* find out to which block the souceChar belongs*/ michael@0: range =(uint16_t)((sourceChar-INDIC_BLOCK_BEGIN)/DELTA); michael@0: newDelta =(uint16_t)(range*DELTA); michael@0: michael@0: /* Now are we in the same block as the previous? */ michael@0: if (newDelta!= converterData->currentDeltaFromUnicode || converterData->isFirstBuffer) { michael@0: converterData->currentDeltaFromUnicode = newDelta; michael@0: converterData->currentMaskFromUnicode = lookupInitialData[range].maskEnum; michael@0: deltaChanged =TRUE; michael@0: converterData->isFirstBuffer=FALSE; michael@0: } michael@0: michael@0: if (converterData->currentDeltaFromUnicode == PNJ_DELTA) { michael@0: if (sourceChar == PNJ_TIPPI) { michael@0: /* Make sure Tippi is converterd to Bindi. */ michael@0: sourceChar = PNJ_BINDI; michael@0: } else if (sourceChar == PNJ_ADHAK) { michael@0: /* This is for consonant cluster handling. */ michael@0: converterData->contextCharFromUnicode = PNJ_ADHAK; michael@0: } michael@0: michael@0: } michael@0: /* Normalize all Indic codepoints to Devanagari and map them to ISCII */ michael@0: /* now subtract the new delta from sourceChar*/ michael@0: sourceChar -= converterData->currentDeltaFromUnicode; michael@0: } michael@0: michael@0: /* get the target byte unit */ michael@0: targetByteUnit=fromUnicodeTable[(uint8_t)sourceChar]; michael@0: michael@0: /* is the code point valid in current script? */ michael@0: if ((validityTable[(uint8_t)sourceChar] & converterData->currentMaskFromUnicode)==0) { michael@0: /* Vocallic RR is assigned in ISCII Telugu and Unicode */ michael@0: if (converterData->currentDeltaFromUnicode!=(TELUGU_DELTA) || sourceChar!=VOCALLIC_RR) { michael@0: targetByteUnit=missingCharMarker; michael@0: } michael@0: } michael@0: michael@0: if (deltaChanged) { michael@0: /* we are in a script block which is different than michael@0: * previous sourceChar's script block write ATR and language codes michael@0: */ michael@0: uint32_t temp=0; michael@0: temp =(uint16_t)(ATR<<8); michael@0: temp += (uint16_t)((uint8_t) lookupInitialData[range].isciiLang); michael@0: /* reset */ michael@0: deltaChanged=FALSE; michael@0: /* now append ATR and language code */ michael@0: WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,temp,err); michael@0: if (U_FAILURE(*err)) { michael@0: break; michael@0: } michael@0: } michael@0: michael@0: if (converterData->currentDeltaFromUnicode == PNJ_DELTA && (sourceChar + PNJ_DELTA) == PNJ_ADHAK) { michael@0: continue; michael@0: } michael@0: } michael@0: /* reset context char */ michael@0: converterData->contextCharFromUnicode = 0x00; michael@0: break; michael@0: } michael@0: if (converterData->currentDeltaFromUnicode == PNJ_DELTA && tempContextFromUnicode == PNJ_ADHAK && isPNJConsonant((sourceChar + PNJ_DELTA))) { michael@0: /* If the previous codepoint is Adhak and the current codepoint is a consonant, the targetByteUnit should be C + Halant + C. */ michael@0: /* reset context char */ michael@0: converterData->contextCharFromUnicode = 0x0000; michael@0: targetByteUnit = targetByteUnit << 16 | ISCII_HALANT << 8 | targetByteUnit; michael@0: /* write targetByteUnit to target */ michael@0: WRITE_TO_TARGET_FROM_U(args, offsets, source, target, targetLimit, targetByteUnit,err); michael@0: if (U_FAILURE(*err)) { michael@0: break; michael@0: } michael@0: } else if (targetByteUnit != missingCharMarker) { michael@0: if (targetByteUnit==ISCII_HALANT) { michael@0: converterData->contextCharFromUnicode = (UChar)targetByteUnit; michael@0: } michael@0: /* write targetByteUnit to target*/ michael@0: WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err); michael@0: if (U_FAILURE(*err)) { michael@0: break; michael@0: } michael@0: } else { michael@0: /* oops.. the code point is unassigned */ michael@0: /*check if the char is a First surrogate*/ michael@0: if (U16_IS_SURROGATE(sourceChar)) { michael@0: if (U16_IS_SURROGATE_LEAD(sourceChar)) { michael@0: getTrail: michael@0: /*look ahead to find the trail surrogate*/ michael@0: if (source < sourceLimit) { michael@0: /* test the following code unit */ michael@0: UChar trail= (*source); michael@0: if (U16_IS_TRAIL(trail)) { michael@0: source++; michael@0: sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); michael@0: *err =U_INVALID_CHAR_FOUND; michael@0: /* convert this surrogate code point */ michael@0: /* exit this condition tree */ michael@0: } else { michael@0: /* this is an unmatched lead code unit (1st surrogate) */ michael@0: /* callback(illegal) */ michael@0: *err=U_ILLEGAL_CHAR_FOUND; michael@0: } michael@0: } else { michael@0: /* no more input */ michael@0: *err = U_ZERO_ERROR; michael@0: } michael@0: } else { michael@0: /* this is an unmatched trail code unit (2nd surrogate) */ michael@0: /* callback(illegal) */ michael@0: *err=U_ILLEGAL_CHAR_FOUND; michael@0: } michael@0: } else { michael@0: /* callback(unassigned) for a BMP code point */ michael@0: *err = U_INVALID_CHAR_FOUND; michael@0: } michael@0: michael@0: args->converter->fromUChar32=sourceChar; michael@0: break; michael@0: } michael@0: }/* end while(mySourceIndexsource = source; michael@0: args->target = (char*)target; michael@0: } michael@0: michael@0: static const uint16_t lookupTable[][2]={ michael@0: { ZERO, ZERO }, /*DEFALT*/ michael@0: { ZERO, ZERO }, /*ROMAN*/ michael@0: { DEVANAGARI, DEV_MASK }, michael@0: { BENGALI, BNG_MASK }, michael@0: { TAMIL, TML_MASK }, michael@0: { TELUGU, KND_MASK }, michael@0: { BENGALI, BNG_MASK }, michael@0: { ORIYA, ORI_MASK }, michael@0: { KANNADA, KND_MASK }, michael@0: { MALAYALAM, MLM_MASK }, michael@0: { GUJARATI, GJR_MASK }, michael@0: { GURMUKHI, PNJ_MASK } michael@0: }; michael@0: michael@0: #define WRITE_TO_TARGET_TO_U(args,source,target,offsets,offset,targetUniChar,delta, err){\ michael@0: /* add offset to current Indic Block */ \ michael@0: if(targetUniChar>ASCII_END && \ michael@0: targetUniChar != ZWJ && \ michael@0: targetUniChar != ZWNJ && \ michael@0: targetUniChar != DANDA && \ michael@0: targetUniChar != DOUBLE_DANDA){ \ michael@0: \ michael@0: targetUniChar+=(uint16_t)(delta); \ michael@0: } \ michael@0: /* now write the targetUniChar */ \ michael@0: if(targettargetLimit){ \ michael@0: *(target)++ = (UChar)targetUniChar; \ michael@0: if(offsets){ \ michael@0: *(offsets)++ = (int32_t)(offset); \ michael@0: } \ michael@0: }else{ \ michael@0: args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++] = \ michael@0: (UChar)targetUniChar; \ michael@0: *err = U_BUFFER_OVERFLOW_ERROR; \ michael@0: } \ michael@0: } michael@0: michael@0: #define GET_MAPPING(sourceChar,targetUniChar,data){ \ michael@0: targetUniChar = toUnicodeTable[(sourceChar)] ; \ michael@0: /* is the code point valid in current script? */ \ michael@0: if(sourceChar> ASCII_END && \ michael@0: (validityTable[(targetUniChar & 0x7F)] & data->currentMaskToUnicode)==0){ \ michael@0: /* Vocallic RR is assigne in ISCII Telugu and Unicode */ \ michael@0: if(data->currentDeltaToUnicode!=(TELUGU_DELTA) || \ michael@0: targetUniChar!=VOCALLIC_RR){ \ michael@0: targetUniChar=missingCharMarker; \ michael@0: } \ michael@0: } \ michael@0: } michael@0: michael@0: /*********** michael@0: * Rules for ISCII to Unicode converter michael@0: * ISCII is stateful encoding. To convert ISCII bytes to Unicode, michael@0: * which has both precomposed and decomposed forms characters michael@0: * pre-context and post-context need to be considered. michael@0: * michael@0: * Post context michael@0: * i) ATR : Attribute code is used to declare the font and script switching. michael@0: * Currently we only switch scripts and font codes consumed without generating an error michael@0: * ii) EXT : Extention code is used to declare switching to Sanskrit and for obscure, michael@0: * obsolete characters michael@0: * Pre context michael@0: * i) Halant: if preceeded by a halant then it is a explicit halant michael@0: * ii) Nukta : michael@0: * a) if preceeded by a halant then it is a soft halant michael@0: * b) if preceeded by specific consonants and the ligatures have pre-composed michael@0: * characters in Unicode then convert to pre-composed characters michael@0: * iii) Danda: If Danda is preceeded by a Danda then convert to Double Danda michael@0: * michael@0: */ michael@0: michael@0: static void UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, UErrorCode* err) { michael@0: const char *source = ( char *) args->source; michael@0: UChar *target = args->target; michael@0: const char *sourceLimit = args->sourceLimit; michael@0: const UChar* targetLimit = args->targetLimit; michael@0: uint32_t targetUniChar = 0x0000; michael@0: uint8_t sourceChar = 0x0000; michael@0: UConverterDataISCII* data; michael@0: UChar32* toUnicodeStatus=NULL; michael@0: UChar32 tempTargetUniChar = 0x0000; michael@0: UChar* contextCharToUnicode= NULL; michael@0: UBool found; michael@0: int i; michael@0: int offset = 0; michael@0: michael@0: if ((args->converter == NULL) || (target < args->target) || (source < args->source)) { michael@0: *err = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return; michael@0: } michael@0: michael@0: data = (UConverterDataISCII*)(args->converter->extraInfo); michael@0: contextCharToUnicode = &data->contextCharToUnicode; /* contains previous ISCII codepoint visited */ michael@0: toUnicodeStatus = (UChar32*)&args->converter->toUnicodeStatus;/* contains the mapping to Unicode of the above codepoint*/ michael@0: michael@0: while (U_SUCCESS(*err) && sourcecurrentDeltaToUnicode = (uint16_t)(lookupTable[sourceChar & 0x0F][0] * DELTA); michael@0: data->currentMaskToUnicode = (MaskEnum)lookupTable[sourceChar & 0x0F][1]; michael@0: } else if (sourceChar==DEF) { michael@0: /* switch back to default */ michael@0: data->currentDeltaToUnicode = data->defDeltaToUnicode; michael@0: data->currentMaskToUnicode = data->defMaskToUnicode; michael@0: } else { michael@0: if ((sourceChar >= 0x21 && sourceChar <= 0x3F)) { michael@0: /* these are display codes consume and continue */ michael@0: } else { michael@0: *err =U_ILLEGAL_CHAR_FOUND; michael@0: /* reset */ michael@0: *contextCharToUnicode=NO_CHAR_MARKER; michael@0: goto CALLBACK; michael@0: } michael@0: } michael@0: michael@0: /* reset */ michael@0: *contextCharToUnicode=NO_CHAR_MARKER; michael@0: michael@0: continue; michael@0: michael@0: } else if (*contextCharToUnicode==EXT) { michael@0: /* check if sourceChar is in 0xA1-0xEE range */ michael@0: if ((uint8_t) (EXT_RANGE_END - sourceChar) <= (EXT_RANGE_END - EXT_RANGE_BEGIN)) { michael@0: /* We currently support only Anudatta and Devanagari abbreviation sign */ michael@0: if (sourceChar==0xBF || sourceChar == 0xB8) { michael@0: targetUniChar = (sourceChar==0xBF) ? DEV_ABBR_SIGN : DEV_ANUDATTA; michael@0: michael@0: /* find out if the mapping is valid in this state */ michael@0: if (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode) { michael@0: *contextCharToUnicode= NO_CHAR_MARKER; michael@0: michael@0: /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ michael@0: if (data->prevToUnicodeStatus) { michael@0: WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err); michael@0: data->prevToUnicodeStatus = 0x0000; michael@0: } michael@0: /* write to target */ michael@0: WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err); michael@0: michael@0: continue; michael@0: } michael@0: } michael@0: /* byte unit is unassigned */ michael@0: targetUniChar = missingCharMarker; michael@0: *err= U_INVALID_CHAR_FOUND; michael@0: } else { michael@0: /* only 0xA1 - 0xEE are legal after EXT char */ michael@0: *contextCharToUnicode= NO_CHAR_MARKER; michael@0: *err = U_ILLEGAL_CHAR_FOUND; michael@0: } michael@0: goto CALLBACK; michael@0: } else if (*contextCharToUnicode==ISCII_INV) { michael@0: if (sourceChar==ISCII_HALANT) { michael@0: targetUniChar = 0x0020; /* replace with space accoding to Indic FAQ */ michael@0: } else { michael@0: targetUniChar = ZWJ; michael@0: } michael@0: michael@0: /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ michael@0: if (data->prevToUnicodeStatus) { michael@0: WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err); michael@0: data->prevToUnicodeStatus = 0x0000; michael@0: } michael@0: /* write to target */ michael@0: WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err); michael@0: /* reset */ michael@0: *contextCharToUnicode=NO_CHAR_MARKER; michael@0: } michael@0: michael@0: /* look at the pre-context and perform special processing */ michael@0: switch (sourceChar) { michael@0: case ISCII_INV: michael@0: case EXT: /*falls through*/ michael@0: case ATR: michael@0: *contextCharToUnicode = (UChar)sourceChar; michael@0: michael@0: if (*toUnicodeStatus != missingCharMarker) { michael@0: /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ michael@0: if (data->prevToUnicodeStatus) { michael@0: WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err); michael@0: data->prevToUnicodeStatus = 0x0000; michael@0: } michael@0: WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,data->currentDeltaToUnicode,err); michael@0: *toUnicodeStatus = missingCharMarker; michael@0: } michael@0: continue; michael@0: case ISCII_DANDA: michael@0: /* handle double danda*/ michael@0: if (*contextCharToUnicode== ISCII_DANDA) { michael@0: targetUniChar = DOUBLE_DANDA; michael@0: /* clear the context */ michael@0: *contextCharToUnicode = NO_CHAR_MARKER; michael@0: *toUnicodeStatus = missingCharMarker; michael@0: } else { michael@0: GET_MAPPING(sourceChar,targetUniChar,data); michael@0: *contextCharToUnicode = sourceChar; michael@0: } michael@0: break; michael@0: case ISCII_HALANT: michael@0: /* handle explicit halant */ michael@0: if (*contextCharToUnicode == ISCII_HALANT) { michael@0: targetUniChar = ZWNJ; michael@0: /* clear the context */ michael@0: *contextCharToUnicode = NO_CHAR_MARKER; michael@0: } else { michael@0: GET_MAPPING(sourceChar,targetUniChar,data); michael@0: *contextCharToUnicode = sourceChar; michael@0: } michael@0: break; michael@0: case 0x0A: michael@0: /* fall through */ michael@0: case 0x0D: michael@0: data->resetToDefaultToUnicode = TRUE; michael@0: GET_MAPPING(sourceChar,targetUniChar,data) michael@0: ; michael@0: *contextCharToUnicode = sourceChar; michael@0: break; michael@0: michael@0: case ISCII_VOWEL_SIGN_E: michael@0: i=1; michael@0: found=FALSE; michael@0: for (; icurrentMaskToUnicode) { michael@0: /*targetUniChar += data->currentDeltaToUnicode ;*/ michael@0: *contextCharToUnicode= NO_CHAR_MARKER; michael@0: *toUnicodeStatus = missingCharMarker; michael@0: break; michael@0: } michael@0: } michael@0: GET_MAPPING(sourceChar,targetUniChar,data); michael@0: *contextCharToUnicode = sourceChar; michael@0: break; michael@0: michael@0: case ISCII_NUKTA: michael@0: /* handle soft halant */ michael@0: if (*contextCharToUnicode == ISCII_HALANT) { michael@0: targetUniChar = ZWJ; michael@0: /* clear the context */ michael@0: *contextCharToUnicode = NO_CHAR_MARKER; michael@0: break; michael@0: } else if (data->currentDeltaToUnicode == PNJ_DELTA && data->contextCharToUnicode == 0xc0) { michael@0: /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ michael@0: if (data->prevToUnicodeStatus) { michael@0: WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err); michael@0: data->prevToUnicodeStatus = 0x0000; michael@0: } michael@0: /* We got here because ISCII_NUKTA was preceded by 0xc0 and we are converting Gurmukhi. michael@0: * In that case we must convert (0xc0 0xe9) to (\u0a5c\u0a4d\u0a39). michael@0: */ michael@0: targetUniChar = PNJ_RRA; michael@0: WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err); michael@0: if (U_SUCCESS(*err)) { michael@0: targetUniChar = PNJ_SIGN_VIRAMA; michael@0: WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err); michael@0: if (U_SUCCESS(*err)) { michael@0: targetUniChar = PNJ_HA; michael@0: WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err); michael@0: } else { michael@0: args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_HA; michael@0: } michael@0: } else { michael@0: args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_SIGN_VIRAMA; michael@0: args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_HA; michael@0: } michael@0: *toUnicodeStatus = missingCharMarker; michael@0: data->contextCharToUnicode = NO_CHAR_MARKER; michael@0: continue; michael@0: } else { michael@0: /* try to handle + ISCII_NUKTA special mappings */ michael@0: i=1; michael@0: found =FALSE; michael@0: for (; icurrentMaskToUnicode) { michael@0: /*targetUniChar += data->currentDeltaToUnicode ;*/ michael@0: *contextCharToUnicode= NO_CHAR_MARKER; michael@0: *toUnicodeStatus = missingCharMarker; michael@0: if (data->currentDeltaToUnicode == PNJ_DELTA) { michael@0: /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ michael@0: if (data->prevToUnicodeStatus) { michael@0: WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err); michael@0: data->prevToUnicodeStatus = 0x0000; michael@0: } michael@0: WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err); michael@0: continue; michael@0: } michael@0: break; michael@0: } michael@0: /* else fall through to default */ michael@0: } michael@0: /* else fall through to default */ michael@0: } michael@0: default:GET_MAPPING(sourceChar,targetUniChar,data) michael@0: ; michael@0: *contextCharToUnicode = sourceChar; michael@0: break; michael@0: } michael@0: michael@0: if (*toUnicodeStatus != missingCharMarker) { michael@0: /* Check to make sure that consonant clusters are handled correct for Gurmukhi script. */ michael@0: if (data->currentDeltaToUnicode == PNJ_DELTA && data->prevToUnicodeStatus != 0 && isPNJConsonant(data->prevToUnicodeStatus) && michael@0: (*toUnicodeStatus + PNJ_DELTA) == PNJ_SIGN_VIRAMA && (targetUniChar + PNJ_DELTA) == data->prevToUnicodeStatus) { michael@0: /* Consonant clusters C + HALANT + C should be encoded as ADHAK + C */ michael@0: offset = (int)(source-args->source - 3); michael@0: tempTargetUniChar = PNJ_ADHAK; /* This is necessary to avoid some compiler warnings. */ michael@0: WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,offset,tempTargetUniChar,0,err); michael@0: WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,offset,data->prevToUnicodeStatus,0,err); michael@0: data->prevToUnicodeStatus = 0x0000; /* reset the previous unicode code point */ michael@0: *toUnicodeStatus = missingCharMarker; michael@0: continue; michael@0: } else { michael@0: /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ michael@0: if (data->prevToUnicodeStatus) { michael@0: WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err); michael@0: data->prevToUnicodeStatus = 0x0000; michael@0: } michael@0: /* Check to make sure that Bindi and Tippi are handled correctly for Gurmukhi script. michael@0: * If 0xA2 is preceded by a codepoint in the PNJ_BINDI_TIPPI_SET then the target codepoint should be Tippi instead of Bindi. michael@0: */ michael@0: if (data->currentDeltaToUnicode == PNJ_DELTA && (targetUniChar + PNJ_DELTA) == PNJ_BINDI && isPNJBindiTippi((*toUnicodeStatus + PNJ_DELTA))) { michael@0: targetUniChar = PNJ_TIPPI - PNJ_DELTA; michael@0: WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,PNJ_DELTA,err); michael@0: } else if (data->currentDeltaToUnicode == PNJ_DELTA && (targetUniChar + PNJ_DELTA) == PNJ_SIGN_VIRAMA && isPNJConsonant((*toUnicodeStatus + PNJ_DELTA))) { michael@0: /* Store the current toUnicodeStatus code point for later handling of consonant cluster in Gurmukhi. */ michael@0: data->prevToUnicodeStatus = *toUnicodeStatus + PNJ_DELTA; michael@0: } else { michael@0: /* write the previously mapped codepoint */ michael@0: WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,data->currentDeltaToUnicode,err); michael@0: } michael@0: } michael@0: *toUnicodeStatus = missingCharMarker; michael@0: } michael@0: michael@0: if (targetUniChar != missingCharMarker) { michael@0: /* now save the targetUniChar for delayed write */ michael@0: *toUnicodeStatus = (UChar) targetUniChar; michael@0: if (data->resetToDefaultToUnicode==TRUE) { michael@0: data->currentDeltaToUnicode = data->defDeltaToUnicode; michael@0: data->currentMaskToUnicode = data->defMaskToUnicode; michael@0: data->resetToDefaultToUnicode=FALSE; michael@0: } michael@0: } else { michael@0: michael@0: /* we reach here only if targetUniChar == missingCharMarker michael@0: * so assign codes to reason and err michael@0: */ michael@0: *err = U_INVALID_CHAR_FOUND; michael@0: CALLBACK: michael@0: args->converter->toUBytes[0] = (uint8_t) sourceChar; michael@0: args->converter->toULength = 1; michael@0: break; michael@0: } michael@0: michael@0: } else { michael@0: *err =U_BUFFER_OVERFLOW_ERROR; michael@0: break; michael@0: } michael@0: } michael@0: michael@0: if (U_SUCCESS(*err) && args->flush && source == sourceLimit) { michael@0: /* end of the input stream */ michael@0: UConverter *cnv = args->converter; michael@0: michael@0: if (*contextCharToUnicode==ATR || *contextCharToUnicode==EXT || *contextCharToUnicode==ISCII_INV) { michael@0: /* set toUBytes[] */ michael@0: cnv->toUBytes[0] = (uint8_t)*contextCharToUnicode; michael@0: cnv->toULength = 1; michael@0: michael@0: /* avoid looping on truncated sequences */ michael@0: *contextCharToUnicode = NO_CHAR_MARKER; michael@0: } else { michael@0: cnv->toULength = 0; michael@0: } michael@0: michael@0: if (*toUnicodeStatus != missingCharMarker) { michael@0: /* output a remaining target character */ michael@0: WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source - args->source -1),*toUnicodeStatus,data->currentDeltaToUnicode,err); michael@0: *toUnicodeStatus = missingCharMarker; michael@0: } michael@0: } michael@0: michael@0: args->target = target; michael@0: args->source = source; michael@0: } michael@0: michael@0: /* structure for SafeClone calculations */ michael@0: struct cloneISCIIStruct { michael@0: UConverter cnv; michael@0: UConverterDataISCII mydata; michael@0: }; michael@0: michael@0: static UConverter * michael@0: _ISCII_SafeClone(const UConverter *cnv, michael@0: void *stackBuffer, michael@0: int32_t *pBufferSize, michael@0: UErrorCode *status) michael@0: { michael@0: struct cloneISCIIStruct * localClone; michael@0: int32_t bufferSizeNeeded = sizeof(struct cloneISCIIStruct); michael@0: michael@0: if (U_FAILURE(*status)) { michael@0: return 0; michael@0: } michael@0: michael@0: if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */ michael@0: *pBufferSize = bufferSizeNeeded; michael@0: return 0; michael@0: } michael@0: michael@0: localClone = (struct cloneISCIIStruct *)stackBuffer; michael@0: /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ michael@0: michael@0: uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataISCII)); michael@0: localClone->cnv.extraInfo = &localClone->mydata; michael@0: localClone->cnv.isExtraLocal = TRUE; michael@0: michael@0: return &localClone->cnv; michael@0: } michael@0: michael@0: static void michael@0: _ISCIIGetUnicodeSet(const UConverter *cnv, michael@0: const USetAdder *sa, michael@0: UConverterUnicodeSet which, michael@0: UErrorCode *pErrorCode) michael@0: { michael@0: int32_t idx, script; michael@0: uint8_t mask; michael@0: michael@0: /* Since all ISCII versions allow switching to other ISCII michael@0: scripts, we add all roundtrippable characters to this set. */ michael@0: sa->addRange(sa->set, 0, ASCII_END); michael@0: for (script = DEVANAGARI; script <= MALAYALAM; script++) { michael@0: mask = (uint8_t)(lookupInitialData[script].maskEnum); michael@0: for (idx = 0; idx < DELTA; idx++) { michael@0: /* added check for TELUGU character */ michael@0: if ((validityTable[idx] & mask) || (script==TELUGU && idx==0x31)) { michael@0: sa->add(sa->set, idx + (script * DELTA) + INDIC_BLOCK_BEGIN); michael@0: } michael@0: } michael@0: } michael@0: sa->add(sa->set, DANDA); michael@0: sa->add(sa->set, DOUBLE_DANDA); michael@0: sa->add(sa->set, ZWNJ); michael@0: sa->add(sa->set, ZWJ); michael@0: } michael@0: michael@0: static const UConverterImpl _ISCIIImpl={ michael@0: michael@0: UCNV_ISCII, michael@0: michael@0: NULL, michael@0: NULL, michael@0: michael@0: _ISCIIOpen, michael@0: _ISCIIClose, michael@0: _ISCIIReset, michael@0: michael@0: UConverter_toUnicode_ISCII_OFFSETS_LOGIC, michael@0: UConverter_toUnicode_ISCII_OFFSETS_LOGIC, michael@0: UConverter_fromUnicode_ISCII_OFFSETS_LOGIC, michael@0: UConverter_fromUnicode_ISCII_OFFSETS_LOGIC, michael@0: NULL, michael@0: michael@0: NULL, michael@0: _ISCIIgetName, michael@0: NULL, michael@0: _ISCII_SafeClone, michael@0: _ISCIIGetUnicodeSet michael@0: }; michael@0: michael@0: static const UConverterStaticData _ISCIIStaticData={ michael@0: sizeof(UConverterStaticData), michael@0: "ISCII", michael@0: 0, michael@0: UCNV_IBM, michael@0: UCNV_ISCII, michael@0: 1, michael@0: 4, michael@0: { 0x1a, 0, 0, 0 }, michael@0: 0x1, michael@0: FALSE, michael@0: FALSE, michael@0: 0x0, michael@0: 0x0, michael@0: { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */ michael@0: michael@0: }; michael@0: michael@0: const UConverterSharedData _ISCIIData={ michael@0: sizeof(UConverterSharedData), michael@0: ~((uint32_t) 0), michael@0: NULL, michael@0: NULL, michael@0: &_ISCIIStaticData, michael@0: FALSE, michael@0: &_ISCIIImpl, michael@0: 0 michael@0: }; michael@0: michael@0: #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */