michael@0: /* michael@0: ****************************************************************************** michael@0: * michael@0: * Copyright (C) 2000-2013, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ****************************************************************************** michael@0: * file name: ushape.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2000jun29 michael@0: * created by: Markus W. Scherer michael@0: * michael@0: * Arabic letter shaping implemented by Ayman Roshdy michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/uchar.h" michael@0: #include "unicode/ustring.h" michael@0: #include "unicode/ushape.h" michael@0: #include "cmemory.h" michael@0: #include "putilimp.h" michael@0: #include "ustr_imp.h" michael@0: #include "ubidi_props.h" michael@0: #include "uassert.h" michael@0: michael@0: #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) michael@0: michael@0: /* michael@0: * This implementation is designed for 16-bit Unicode strings. michael@0: * The main assumption is that the Arabic characters and their michael@0: * presentation forms each fit into a single UChar. michael@0: * With UTF-8, they occupy 2 or 3 bytes, and more than the ASCII michael@0: * characters. michael@0: */ michael@0: michael@0: /* michael@0: * ### TODO in general for letter shaping: michael@0: * - the letter shaping code is UTF-16-unaware; needs update michael@0: * + especially invertBuffer()?! michael@0: * - needs to handle the "Arabic Tail" that is used in some legacy codepages michael@0: * as a glyph fragment of wide-glyph letters michael@0: * + IBM Unicode conversion tables map it to U+200B (ZWSP) michael@0: * + IBM Egypt has proposed to encode the tail in Unicode among Arabic Presentation Forms michael@0: * + Unicode 3.2 added U+FE73 ARABIC TAIL FRAGMENT michael@0: */ michael@0: michael@0: /* definitions for Arabic letter shaping ------------------------------------ */ michael@0: michael@0: #define IRRELEVANT 4 michael@0: #define LAMTYPE 16 michael@0: #define ALEFTYPE 32 michael@0: #define LINKR 1 michael@0: #define LINKL 2 michael@0: #define APRESENT 8 michael@0: #define SHADDA 64 michael@0: #define CSHADDA 128 michael@0: #define COMBINE (SHADDA+CSHADDA) michael@0: michael@0: #define HAMZAFE_CHAR 0xfe80 michael@0: #define HAMZA06_CHAR 0x0621 michael@0: #define YEH_HAMZA_CHAR 0x0626 michael@0: #define YEH_HAMZAFE_CHAR 0xFE89 michael@0: #define LAMALEF_SPACE_SUB 0xFFFF michael@0: #define TASHKEEL_SPACE_SUB 0xFFFE michael@0: #define NEW_TAIL_CHAR 0xFE73 michael@0: #define OLD_TAIL_CHAR 0x200B michael@0: #define LAM_CHAR 0x0644 michael@0: #define SPACE_CHAR 0x0020 michael@0: #define SHADDA_CHAR 0xFE7C michael@0: #define TATWEEL_CHAR 0x0640 michael@0: #define SHADDA_TATWEEL_CHAR 0xFE7D michael@0: #define SHADDA06_CHAR 0x0651 michael@0: michael@0: #define SHAPE_MODE 0 michael@0: #define DESHAPE_MODE 1 michael@0: michael@0: struct uShapeVariables { michael@0: UChar tailChar; michael@0: uint32_t uShapeLamalefBegin; michael@0: uint32_t uShapeLamalefEnd; michael@0: uint32_t uShapeTashkeelBegin; michael@0: uint32_t uShapeTashkeelEnd; michael@0: int spacesRelativeToTextBeginEnd; michael@0: }; michael@0: michael@0: static const uint8_t tailFamilyIsolatedFinal[] = { michael@0: /* FEB1 */ 1, michael@0: /* FEB2 */ 1, michael@0: /* FEB3 */ 0, michael@0: /* FEB4 */ 0, michael@0: /* FEB5 */ 1, michael@0: /* FEB6 */ 1, michael@0: /* FEB7 */ 0, michael@0: /* FEB8 */ 0, michael@0: /* FEB9 */ 1, michael@0: /* FEBA */ 1, michael@0: /* FEBB */ 0, michael@0: /* FEBC */ 0, michael@0: /* FEBD */ 1, michael@0: /* FEBE */ 1 michael@0: }; michael@0: michael@0: static const uint8_t tashkeelMedial[] = { michael@0: /* FE70 */ 0, michael@0: /* FE71 */ 1, michael@0: /* FE72 */ 0, michael@0: /* FE73 */ 0, michael@0: /* FE74 */ 0, michael@0: /* FE75 */ 0, michael@0: /* FE76 */ 0, michael@0: /* FE77 */ 1, michael@0: /* FE78 */ 0, michael@0: /* FE79 */ 1, michael@0: /* FE7A */ 0, michael@0: /* FE7B */ 1, michael@0: /* FE7C */ 0, michael@0: /* FE7D */ 1, michael@0: /* FE7E */ 0, michael@0: /* FE7F */ 1 michael@0: }; michael@0: michael@0: static const UChar yehHamzaToYeh[] = michael@0: { michael@0: /* isolated*/ 0xFEEF, michael@0: /* final */ 0xFEF0 michael@0: }; michael@0: michael@0: static const uint8_t IrrelevantPos[] = { michael@0: 0x0, 0x2, 0x4, 0x6, michael@0: 0x8, 0xA, 0xC, 0xE michael@0: }; michael@0: michael@0: michael@0: static const UChar convertLamAlef[] = michael@0: { michael@0: /*FEF5*/ 0x0622, michael@0: /*FEF6*/ 0x0622, michael@0: /*FEF7*/ 0x0623, michael@0: /*FEF8*/ 0x0623, michael@0: /*FEF9*/ 0x0625, michael@0: /*FEFA*/ 0x0625, michael@0: /*FEFB*/ 0x0627, michael@0: /*FEFC*/ 0x0627 michael@0: }; michael@0: michael@0: static const UChar araLink[178]= michael@0: { michael@0: 1 + 32 + 256 * 0x11,/*0x0622*/ michael@0: 1 + 32 + 256 * 0x13,/*0x0623*/ michael@0: 1 + 256 * 0x15,/*0x0624*/ michael@0: 1 + 32 + 256 * 0x17,/*0x0625*/ michael@0: 1 + 2 + 256 * 0x19,/*0x0626*/ michael@0: 1 + 32 + 256 * 0x1D,/*0x0627*/ michael@0: 1 + 2 + 256 * 0x1F,/*0x0628*/ michael@0: 1 + 256 * 0x23,/*0x0629*/ michael@0: 1 + 2 + 256 * 0x25,/*0x062A*/ michael@0: 1 + 2 + 256 * 0x29,/*0x062B*/ michael@0: 1 + 2 + 256 * 0x2D,/*0x062C*/ michael@0: 1 + 2 + 256 * 0x31,/*0x062D*/ michael@0: 1 + 2 + 256 * 0x35,/*0x062E*/ michael@0: 1 + 256 * 0x39,/*0x062F*/ michael@0: 1 + 256 * 0x3B,/*0x0630*/ michael@0: 1 + 256 * 0x3D,/*0x0631*/ michael@0: 1 + 256 * 0x3F,/*0x0632*/ michael@0: 1 + 2 + 256 * 0x41,/*0x0633*/ michael@0: 1 + 2 + 256 * 0x45,/*0x0634*/ michael@0: 1 + 2 + 256 * 0x49,/*0x0635*/ michael@0: 1 + 2 + 256 * 0x4D,/*0x0636*/ michael@0: 1 + 2 + 256 * 0x51,/*0x0637*/ michael@0: 1 + 2 + 256 * 0x55,/*0x0638*/ michael@0: 1 + 2 + 256 * 0x59,/*0x0639*/ michael@0: 1 + 2 + 256 * 0x5D,/*0x063A*/ michael@0: 0, 0, 0, 0, 0, /*0x063B-0x063F*/ michael@0: 1 + 2, /*0x0640*/ michael@0: 1 + 2 + 256 * 0x61,/*0x0641*/ michael@0: 1 + 2 + 256 * 0x65,/*0x0642*/ michael@0: 1 + 2 + 256 * 0x69,/*0x0643*/ michael@0: 1 + 2 + 16 + 256 * 0x6D,/*0x0644*/ michael@0: 1 + 2 + 256 * 0x71,/*0x0645*/ michael@0: 1 + 2 + 256 * 0x75,/*0x0646*/ michael@0: 1 + 2 + 256 * 0x79,/*0x0647*/ michael@0: 1 + 256 * 0x7D,/*0x0648*/ michael@0: 1 + 256 * 0x7F,/*0x0649*/ michael@0: 1 + 2 + 256 * 0x81,/*0x064A*/ michael@0: 4 + 256 * 1, /*0x064B*/ michael@0: 4 + 128 + 256 * 1, /*0x064C*/ michael@0: 4 + 128 + 256 * 1, /*0x064D*/ michael@0: 4 + 128 + 256 * 1, /*0x064E*/ michael@0: 4 + 128 + 256 * 1, /*0x064F*/ michael@0: 4 + 128 + 256 * 1, /*0x0650*/ michael@0: 4 + 64 + 256 * 3, /*0x0651*/ michael@0: 4 + 256 * 1, /*0x0652*/ michael@0: 4 + 256 * 7, /*0x0653*/ michael@0: 4 + 256 * 8, /*0x0654*/ michael@0: 4 + 256 * 8, /*0x0655*/ michael@0: 4 + 256 * 1, /*0x0656*/ michael@0: 0, 0, 0, 0, 0, /*0x0657-0x065B*/ michael@0: 1 + 256 * 0x85,/*0x065C*/ michael@0: 1 + 256 * 0x87,/*0x065D*/ michael@0: 1 + 256 * 0x89,/*0x065E*/ michael@0: 1 + 256 * 0x8B,/*0x065F*/ michael@0: 0, 0, 0, 0, 0, /*0x0660-0x0664*/ michael@0: 0, 0, 0, 0, 0, /*0x0665-0x0669*/ michael@0: 0, 0, 0, 0, 0, 0, /*0x066A-0x066F*/ michael@0: 4 + 256 * 6, /*0x0670*/ michael@0: 1 + 8 + 256 * 0x00,/*0x0671*/ michael@0: 1 + 32, /*0x0672*/ michael@0: 1 + 32, /*0x0673*/ michael@0: 0, /*0x0674*/ michael@0: 1 + 32, /*0x0675*/ michael@0: 1, 1, /*0x0676-0x0677*/ michael@0: 1+2, 1+2, 1+2, 1+2, 1+2, 1+2, /*0x0678-0x067D*/ michael@0: 1+2+8+256 * 0x06, 1+2, 1+2, 1+2, 1+2, 1+2, /*0x067E-0x0683*/ michael@0: 1+2, 1+2, 1+2+8+256 * 0x2A, 1+2, /*0x0684-0x0687*/ michael@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*0x0688-0x0691*/ michael@0: 1, 1, 1, 1, 1, 1, 1+8+256 * 0x3A, 1, /*0x0692-0x0699*/ michael@0: 1+2, 1+2, 1+2, 1+2, 1+2, 1+2, /*0x069A-0x06A3*/ michael@0: 1+2, 1+2, 1+2, 1+2, /*0x069A-0x06A3*/ michael@0: 1+2, 1+2, 1+2, 1+2, 1+2, 1+2+8+256 * 0x3E, /*0x06A4-0x06AD*/ michael@0: 1+2, 1+2, 1+2, 1+2, /*0x06A4-0x06AD*/ michael@0: 1+2, 1+2+8+256 * 0x42, 1+2, 1+2, 1+2, 1+2, /*0x06AE-0x06B7*/ michael@0: 1+2, 1+2, 1+2, 1+2, /*0x06AE-0x06B7*/ michael@0: 1+2, 1+2, 1+2, 1+2, 1+2, 1+2, /*0x06B8-0x06BF*/ michael@0: 1+2, 1+2, /*0x06B8-0x06BF*/ michael@0: 1, /*0x06C0*/ michael@0: 1+2, /*0x06C1*/ michael@0: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*0x06C2-0x06CB*/ michael@0: 1+2+8+256 * 0xAC, /*0x06CC*/ michael@0: 1, /*0x06CD*/ michael@0: 1+2, 1+2, 1+2, 1+2, /*0x06CE-0x06D1*/ michael@0: 1, 1 /*0x06D2-0x06D3*/ michael@0: }; michael@0: michael@0: static const uint8_t presALink[] = { michael@0: /***********0*****1*****2*****3*****4*****5*****6*****7*****8*****9*****A*****B*****C*****D*****E*****F*/ michael@0: /*FB5*/ 0, 1, 0, 0, 0, 0, 0, 1, 2,1 + 2, 0, 0, 0, 0, 0, 0, michael@0: /*FB6*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, michael@0: /*FB7*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2,1 + 2, 0, 0, michael@0: /*FB8*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, michael@0: /*FB9*/ 2,1 + 2, 0, 1, 2,1 + 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, michael@0: /*FBA*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, michael@0: /*FBB*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, michael@0: /*FBC*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, michael@0: /*FBD*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, michael@0: /*FBE*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, michael@0: /*FBF*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2,1 + 2, michael@0: /*FC0*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, michael@0: /*FC1*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, michael@0: /*FC2*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, michael@0: /*FC3*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, michael@0: /*FC4*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, michael@0: /*FC5*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, michael@0: /*FC6*/ 4, 4, 4 michael@0: }; michael@0: michael@0: static const uint8_t presBLink[]= michael@0: { michael@0: /***********0*****1*****2*****3*****4*****5*****6*****7*****8*****9*****A*****B*****C*****D*****E*****F*/ michael@0: /*FE7*/1 + 2,1 + 2,1 + 2, 0,1 + 2, 0,1 + 2,1 + 2,1 + 2,1 + 2,1 + 2,1 + 2,1 + 2,1 + 2,1 + 2,1 + 2, michael@0: /*FE8*/ 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2,1 + 2, 0, 1, 0, michael@0: /*FE9*/ 1, 2,1 + 2, 0, 1, 0, 1, 2,1 + 2, 0, 1, 2,1 + 2, 0, 1, 2, michael@0: /*FEA*/1 + 2, 0, 1, 2,1 + 2, 0, 1, 2,1 + 2, 0, 1, 0, 1, 0, 1, 0, michael@0: /*FEB*/ 1, 0, 1, 2,1 + 2, 0, 1, 2,1 + 2, 0, 1, 2,1 + 2, 0, 1, 2, michael@0: /*FEC*/1 + 2, 0, 1, 2,1 + 2, 0, 1, 2,1 + 2, 0, 1, 2,1 + 2, 0, 1, 2, michael@0: /*FED*/1 + 2, 0, 1, 2,1 + 2, 0, 1, 2,1 + 2, 0, 1, 2,1 + 2, 0, 1, 2, michael@0: /*FEE*/1 + 2, 0, 1, 2,1 + 2, 0, 1, 2,1 + 2, 0, 1, 2,1 + 2, 0, 1, 0, michael@0: /*FEF*/ 1, 0, 1, 2,1 + 2, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0 michael@0: }; michael@0: michael@0: static const UChar convertFBto06[] = michael@0: { michael@0: /***********0******1******2******3******4******5******6******7******8******9******A******B******C******D******E******F***/ michael@0: /*FB5*/ 0x671, 0x671, 0, 0, 0, 0, 0x67E, 0x67E, 0x67E, 0x67E, 0, 0, 0, 0, 0, 0, michael@0: /*FB6*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, michael@0: /*FB7*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x686, 0x686, 0x686, 0x686, 0, 0, michael@0: /*FB8*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x698, 0x698, 0, 0, 0x6A9, 0x6A9, michael@0: /*FB9*/ 0x6A9, 0x6A9, 0x6AF, 0x6AF, 0x6AF, 0x6AF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, michael@0: /*FBA*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, michael@0: /*FBB*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, michael@0: /*FBC*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, michael@0: /*FBD*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, michael@0: /*FBE*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, michael@0: /*FBF*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x6CC, 0x6CC, 0x6CC, 0x6CC michael@0: }; michael@0: michael@0: static const UChar convertFEto06[] = michael@0: { michael@0: /***********0******1******2******3******4******5******6******7******8******9******A******B******C******D******E******F***/ michael@0: /*FE7*/ 0x64B, 0x64B, 0x64C, 0x64C, 0x64D, 0x64D, 0x64E, 0x64E, 0x64F, 0x64F, 0x650, 0x650, 0x651, 0x651, 0x652, 0x652, michael@0: /*FE8*/ 0x621, 0x622, 0x622, 0x623, 0x623, 0x624, 0x624, 0x625, 0x625, 0x626, 0x626, 0x626, 0x626, 0x627, 0x627, 0x628, michael@0: /*FE9*/ 0x628, 0x628, 0x628, 0x629, 0x629, 0x62A, 0x62A, 0x62A, 0x62A, 0x62B, 0x62B, 0x62B, 0x62B, 0x62C, 0x62C, 0x62C, michael@0: /*FEA*/ 0x62C, 0x62D, 0x62D, 0x62D, 0x62D, 0x62E, 0x62E, 0x62E, 0x62E, 0x62F, 0x62F, 0x630, 0x630, 0x631, 0x631, 0x632, michael@0: /*FEB*/ 0x632, 0x633, 0x633, 0x633, 0x633, 0x634, 0x634, 0x634, 0x634, 0x635, 0x635, 0x635, 0x635, 0x636, 0x636, 0x636, michael@0: /*FEC*/ 0x636, 0x637, 0x637, 0x637, 0x637, 0x638, 0x638, 0x638, 0x638, 0x639, 0x639, 0x639, 0x639, 0x63A, 0x63A, 0x63A, michael@0: /*FED*/ 0x63A, 0x641, 0x641, 0x641, 0x641, 0x642, 0x642, 0x642, 0x642, 0x643, 0x643, 0x643, 0x643, 0x644, 0x644, 0x644, michael@0: /*FEE*/ 0x644, 0x645, 0x645, 0x645, 0x645, 0x646, 0x646, 0x646, 0x646, 0x647, 0x647, 0x647, 0x647, 0x648, 0x648, 0x649, michael@0: /*FEF*/ 0x649, 0x64A, 0x64A, 0x64A, 0x64A, 0x65C, 0x65C, 0x65D, 0x65D, 0x65E, 0x65E, 0x65F, 0x65F michael@0: }; michael@0: michael@0: static const uint8_t shapeTable[4][4][4]= michael@0: { michael@0: { {0,0,0,0}, {0,0,0,0}, {0,1,0,3}, {0,1,0,1} }, michael@0: { {0,0,2,2}, {0,0,1,2}, {0,1,1,2}, {0,1,1,3} }, michael@0: { {0,0,0,0}, {0,0,0,0}, {0,1,0,3}, {0,1,0,3} }, michael@0: { {0,0,1,2}, {0,0,1,2}, {0,1,1,2}, {0,1,1,3} } michael@0: }; michael@0: michael@0: /* michael@0: * This function shapes European digits to Arabic-Indic digits michael@0: * in-place, writing over the input characters. michael@0: * Since we know that we are only looking for BMP code points, michael@0: * we can safely just work with code units (again, at least UTF-16). michael@0: */ michael@0: static void michael@0: _shapeToArabicDigitsWithContext(UChar *s, int32_t length, michael@0: UChar digitBase, michael@0: UBool isLogical, UBool lastStrongWasAL) { michael@0: const UBiDiProps *bdp; michael@0: int32_t i; michael@0: UChar c; michael@0: michael@0: bdp=ubidi_getSingleton(); michael@0: digitBase-=0x30; michael@0: michael@0: /* the iteration direction depends on the type of input */ michael@0: if(isLogical) { michael@0: for(i=0; i0; /* pre-decrement in the body */) { michael@0: c=s[--i]; michael@0: switch(ubidi_getClass(bdp, c)) { michael@0: case U_LEFT_TO_RIGHT: /* L */ michael@0: case U_RIGHT_TO_LEFT: /* R */ michael@0: lastStrongWasAL=FALSE; michael@0: break; michael@0: case U_RIGHT_TO_LEFT_ARABIC: /* AL */ michael@0: lastStrongWasAL=TRUE; michael@0: break; michael@0: case U_EUROPEAN_NUMBER: /* EN */ michael@0: if(lastStrongWasAL && (uint32_t)(c-0x30)<10) { michael@0: s[i]=(UChar)(digitBase+c); /* digitBase+(c-0x30) - digitBase was modified above */ michael@0: } michael@0: break; michael@0: default : michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* michael@0: *Name : invertBuffer michael@0: *Function : This function inverts the buffer, it's used michael@0: * in case the user specifies the buffer to be michael@0: * U_SHAPE_TEXT_DIRECTION_LOGICAL michael@0: */ michael@0: static void michael@0: invertBuffer(UChar *buffer, int32_t size, uint32_t /*options*/, int32_t lowlimit, int32_t highlimit) { michael@0: UChar temp; michael@0: int32_t i=0,j=0; michael@0: for(i=lowlimit,j=size-highlimit-1;i= 0x0622 && ch <= 0x06D3) { michael@0: return(araLink[ch-0x0622]); michael@0: } else if(ch == 0x200D) { michael@0: return(3); michael@0: } else if(ch >= 0x206D && ch <= 0x206F) { michael@0: return(4); michael@0: }else if(ch >= 0xFB50 && ch <= 0xFC62) { michael@0: return(presALink[ch-0xFB50]); michael@0: } else if(ch >= 0xFE70 && ch <= 0xFEFC) { michael@0: return(presBLink[ch-0xFE70]); michael@0: }else { michael@0: return(0); michael@0: } michael@0: } michael@0: michael@0: /* michael@0: *Name : countSpaces michael@0: *Function : Counts the number of spaces michael@0: * at each end of the logical buffer michael@0: */ michael@0: static void michael@0: countSpaces(UChar *dest, int32_t size, uint32_t /*options*/, int32_t *spacesCountl, int32_t *spacesCountr) { michael@0: int32_t i = 0; michael@0: int32_t countl = 0,countr = 0; michael@0: while((dest[i] == SPACE_CHAR) && (countl < size)) { michael@0: countl++; michael@0: i++; michael@0: } michael@0: if (countl < size) { /* the entire buffer is not all space */ michael@0: while(dest[size-1] == SPACE_CHAR) { michael@0: countr++; michael@0: size--; michael@0: } michael@0: } michael@0: *spacesCountl = countl; michael@0: *spacesCountr = countr; michael@0: } michael@0: michael@0: /* michael@0: *Name : isTashkeelChar michael@0: *Function : Returns 1 for Tashkeel characters in 06 range else return 0 michael@0: */ michael@0: static inline int32_t michael@0: isTashkeelChar(UChar ch) { michael@0: return (int32_t)( ch>=0x064B && ch<= 0x0652 ); michael@0: } michael@0: michael@0: /* michael@0: *Name : isTashkeelCharFE michael@0: *Function : Returns 1 for Tashkeel characters in FE range else return 0 michael@0: */ michael@0: static inline int32_t michael@0: isTashkeelCharFE(UChar ch) { michael@0: return (int32_t)( ch>=0xFE70 && ch<= 0xFE7F ); michael@0: } michael@0: michael@0: /* michael@0: *Name : isAlefChar michael@0: *Function : Returns 1 for Alef characters else return 0 michael@0: */ michael@0: static inline int32_t michael@0: isAlefChar(UChar ch) { michael@0: return (int32_t)( (ch==0x0622)||(ch==0x0623)||(ch==0x0625)||(ch==0x0627) ); michael@0: } michael@0: michael@0: /* michael@0: *Name : isLamAlefChar michael@0: *Function : Returns 1 for LamAlef characters else return 0 michael@0: */ michael@0: static inline int32_t michael@0: isLamAlefChar(UChar ch) { michael@0: return (int32_t)((ch>=0xFEF5)&&(ch<=0xFEFC) ); michael@0: } michael@0: michael@0: /*BIDI michael@0: *Name : isTailChar michael@0: *Function : returns 1 if the character matches one of the tail characters (0xfe73 or 0x200b) otherwise returns 0 michael@0: */ michael@0: michael@0: static inline int32_t michael@0: isTailChar(UChar ch) { michael@0: if(ch == OLD_TAIL_CHAR || ch == NEW_TAIL_CHAR){ michael@0: return 1; michael@0: }else{ michael@0: return 0; michael@0: } michael@0: } michael@0: michael@0: /*BIDI michael@0: *Name : isSeenTailFamilyChar michael@0: *Function : returns 1 if the character is a seen family isolated character michael@0: * in the FE range otherwise returns 0 michael@0: */ michael@0: michael@0: static inline int32_t michael@0: isSeenTailFamilyChar(UChar ch) { michael@0: if(ch >= 0xfeb1 && ch < 0xfebf){ michael@0: return tailFamilyIsolatedFinal [ch - 0xFEB1]; michael@0: }else{ michael@0: return 0; michael@0: } michael@0: } michael@0: michael@0: /* Name : isSeenFamilyChar michael@0: * Function : returns 1 if the character is a seen family character in the Unicode michael@0: * 06 range otherwise returns 0 michael@0: */ michael@0: michael@0: static inline int32_t michael@0: isSeenFamilyChar(UChar ch){ michael@0: if(ch >= 0x633 && ch <= 0x636){ michael@0: return 1; michael@0: }else { michael@0: return 0; michael@0: } michael@0: } michael@0: michael@0: /*Start of BIDI*/ michael@0: /* michael@0: *Name : isAlefMaksouraChar michael@0: *Function : returns 1 if the character is a Alef Maksoura Final or isolated michael@0: * otherwise returns 0 michael@0: */ michael@0: static inline int32_t michael@0: isAlefMaksouraChar(UChar ch) { michael@0: return (int32_t)( (ch == 0xFEEF) || ( ch == 0xFEF0) || (ch == 0x0649)); michael@0: } michael@0: michael@0: /* michael@0: * Name : isYehHamzaChar michael@0: * Function : returns 1 if the character is a yehHamza isolated or yehhamza michael@0: * final is found otherwise returns 0 michael@0: */ michael@0: static inline int32_t michael@0: isYehHamzaChar(UChar ch) { michael@0: if((ch==0xFE89)||(ch==0xFE8A)){ michael@0: return 1; michael@0: }else{ michael@0: return 0; michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * Name: isTashkeelOnTatweelChar michael@0: * Function: Checks if the Tashkeel Character is on Tatweel or not,if the michael@0: * Tashkeel on tatweel (FE range), it returns 1 else if the michael@0: * Tashkeel with shadda on tatweel (FC range)return 2 otherwise michael@0: * returns 0 michael@0: */ michael@0: static inline int32_t michael@0: isTashkeelOnTatweelChar(UChar ch){ michael@0: if(ch >= 0xfe70 && ch <= 0xfe7f && ch != NEW_TAIL_CHAR && ch != 0xFE75 && ch != SHADDA_TATWEEL_CHAR) michael@0: { michael@0: return tashkeelMedial [ch - 0xFE70]; michael@0: }else if( (ch >= 0xfcf2 && ch <= 0xfcf4) || (ch == SHADDA_TATWEEL_CHAR)) { michael@0: return 2; michael@0: }else{ michael@0: return 0; michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * Name: isIsolatedTashkeelChar michael@0: * Function: Checks if the Tashkeel Character is in the isolated form michael@0: * (i.e. Unicode FE range) returns 1 else if the Tashkeel michael@0: * with shadda is in the isolated form (i.e. Unicode FC range) michael@0: * returns 2 otherwise returns 0 michael@0: */ michael@0: static inline int32_t michael@0: isIsolatedTashkeelChar(UChar ch){ michael@0: if(ch >= 0xfe70 && ch <= 0xfe7f && ch != NEW_TAIL_CHAR && ch != 0xFE75){ michael@0: return (1 - tashkeelMedial [ch - 0xFE70]); michael@0: }else if(ch >= 0xfc5e && ch <= 0xfc63){ michael@0: return 1; michael@0: }else{ michael@0: return 0; michael@0: } michael@0: } michael@0: michael@0: michael@0: michael@0: michael@0: /* michael@0: *Name : calculateSize michael@0: *Function : This function calculates the destSize to be used in preflighting michael@0: * when the destSize is equal to 0 michael@0: * It is used also to calculate the new destsize in case the michael@0: * destination buffer will be resized. michael@0: */ michael@0: michael@0: static int32_t michael@0: calculateSize(const UChar *source, int32_t sourceLength, michael@0: int32_t destSize,uint32_t options) { michael@0: int32_t i = 0; michael@0: michael@0: int lamAlefOption = 0; michael@0: int tashkeelOption = 0; michael@0: michael@0: destSize = sourceLength; michael@0: michael@0: if (((options&U_SHAPE_LETTERS_MASK) == U_SHAPE_LETTERS_SHAPE || michael@0: ((options&U_SHAPE_LETTERS_MASK) == U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED )) && michael@0: ((options&U_SHAPE_LAMALEF_MASK) == U_SHAPE_LAMALEF_RESIZE )){ michael@0: lamAlefOption = 1; michael@0: } michael@0: if((options&U_SHAPE_LETTERS_MASK) == U_SHAPE_LETTERS_SHAPE && michael@0: ((options&U_SHAPE_TASHKEEL_MASK) == U_SHAPE_TASHKEEL_RESIZE ) ){ michael@0: tashkeelOption = 1; michael@0: } michael@0: michael@0: if(lamAlefOption || tashkeelOption){ michael@0: if((options&U_SHAPE_TEXT_DIRECTION_MASK)==U_SHAPE_TEXT_DIRECTION_VISUAL_LTR) { michael@0: for(i=0;i= 0) { michael@0: tempbuffer[i] = 0x0000; michael@0: i--; michael@0: count--; michael@0: } michael@0: michael@0: uprv_memcpy(dest, tempbuffer, sourceLength*U_SIZEOF_UCHAR); michael@0: destSize = u_strlen(dest); michael@0: } michael@0: michael@0: lamAlefOption = 0; michael@0: michael@0: if (shapingMode == 0){ michael@0: if ( (options&U_SHAPE_LAMALEF_MASK) == U_SHAPE_LAMALEF_NEAR ){ michael@0: lamAlefOption = 1; michael@0: } michael@0: } michael@0: michael@0: if (lamAlefOption){ michael@0: /* Lam+Alef is already shaped into LamAlef + FFFF */ michael@0: i = 0; michael@0: while(i < sourceLength) { michael@0: if(lamAlefOption&&dest[i] == LAMALEF_SPACE_SUB){ michael@0: dest[i] = SPACE_CHAR; michael@0: } michael@0: i++; michael@0: } michael@0: destSize = sourceLength; michael@0: } michael@0: lamAlefOption = 0; michael@0: tashkeelOption = 0; michael@0: michael@0: if (shapingMode == 0) { michael@0: if ( ((options&U_SHAPE_LAMALEF_MASK) == shapeVars.uShapeLamalefBegin) || michael@0: (((options&U_SHAPE_LAMALEF_MASK) == U_SHAPE_LAMALEF_AUTO ) michael@0: && (shapeVars.spacesRelativeToTextBeginEnd==1)) ) { michael@0: lamAlefOption = 1; michael@0: } michael@0: if ( (options&U_SHAPE_TASHKEEL_MASK) == shapeVars.uShapeTashkeelBegin ) { michael@0: tashkeelOption = 1; michael@0: } michael@0: } michael@0: michael@0: if(lamAlefOption || tashkeelOption){ michael@0: uprv_memset(tempbuffer, 0, (sourceLength+1)*U_SIZEOF_UCHAR); michael@0: michael@0: i = j = sourceLength; count = 0; michael@0: michael@0: while(i >= 0) { michael@0: if ( (lamAlefOption && dest[i] == LAMALEF_SPACE_SUB) || michael@0: (tashkeelOption && dest[i] == TASHKEEL_SPACE_SUB) ){ michael@0: j++; michael@0: count++; michael@0: }else { michael@0: tempbuffer[j] = dest[i]; michael@0: } michael@0: i--; michael@0: j--; michael@0: } michael@0: michael@0: for(i=0 ;i < count; i++){ michael@0: tempbuffer[i] = SPACE_CHAR; michael@0: } michael@0: michael@0: uprv_memcpy(dest, tempbuffer, sourceLength*U_SIZEOF_UCHAR); michael@0: destSize = sourceLength; michael@0: } michael@0: michael@0: michael@0: michael@0: lamAlefOption = 0; michael@0: tashkeelOption = 0; michael@0: michael@0: if (shapingMode == 0) { michael@0: if ( ((options&U_SHAPE_LAMALEF_MASK) == shapeVars.uShapeLamalefEnd) || michael@0: (((options&U_SHAPE_LAMALEF_MASK) == U_SHAPE_LAMALEF_AUTO ) michael@0: && (shapeVars.spacesRelativeToTextBeginEnd==0)) ) { michael@0: lamAlefOption = 1; michael@0: } michael@0: if ( (options&U_SHAPE_TASHKEEL_MASK) == shapeVars.uShapeTashkeelEnd ){ michael@0: tashkeelOption = 1; michael@0: } michael@0: } michael@0: michael@0: if(lamAlefOption || tashkeelOption){ michael@0: uprv_memset(tempbuffer, 0, (sourceLength+1)*U_SIZEOF_UCHAR); michael@0: michael@0: i = j = 0; count = 0; michael@0: while(i < sourceLength) { michael@0: if ( (lamAlefOption && dest[i] == LAMALEF_SPACE_SUB) || michael@0: (tashkeelOption && dest[i] == TASHKEEL_SPACE_SUB) ){ michael@0: j--; michael@0: count++; michael@0: }else { michael@0: tempbuffer[j] = dest[i]; michael@0: } michael@0: i++; michael@0: j++; michael@0: } michael@0: michael@0: while(count >= 0) { michael@0: tempbuffer[i] = SPACE_CHAR; michael@0: i--; michael@0: count--; michael@0: } michael@0: michael@0: uprv_memcpy(dest,tempbuffer, sourceLength*U_SIZEOF_UCHAR); michael@0: destSize = sourceLength; michael@0: } michael@0: michael@0: michael@0: if(tempbuffer){ michael@0: uprv_free(tempbuffer); michael@0: } michael@0: michael@0: return destSize; michael@0: } michael@0: michael@0: /* michael@0: *Name :expandCompositCharAtBegin michael@0: *Function :Expands the LamAlef character to Lam and Alef consuming the required michael@0: * space from beginning of the buffer. If the text type was visual_LTR michael@0: * and the option U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END was selected michael@0: * the spaces will be located at end of buffer. michael@0: * If there are no spaces to expand the LamAlef, an error michael@0: * will be set to U_NO_SPACE_AVAILABLE as defined in utypes.h michael@0: */ michael@0: michael@0: static int32_t michael@0: expandCompositCharAtBegin(UChar *dest, int32_t sourceLength, int32_t destSize,UErrorCode *pErrorCode) { michael@0: int32_t i = 0,j = 0; michael@0: int32_t countl = 0; michael@0: UChar *tempbuffer=NULL; michael@0: michael@0: tempbuffer = (UChar *)uprv_malloc((sourceLength+1)*U_SIZEOF_UCHAR); michael@0: michael@0: /* Test for NULL */ michael@0: if(tempbuffer == NULL) { michael@0: *pErrorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: uprv_memset(tempbuffer, 0, (sourceLength+1)*U_SIZEOF_UCHAR); michael@0: michael@0: i = 0; michael@0: while(dest[i] == SPACE_CHAR) { michael@0: countl++; michael@0: i++; michael@0: } michael@0: michael@0: i = j = sourceLength-1; michael@0: michael@0: while(i >= 0 && j >= 0) { michael@0: if( countl>0 && isLamAlefChar(dest[i])) { michael@0: tempbuffer[j] = LAM_CHAR; michael@0: /* to ensure the array index is within the range */ michael@0: U_ASSERT(dest[i] >= 0xFEF5u michael@0: && dest[i]-0xFEF5u < sizeof(convertLamAlef)/sizeof(convertLamAlef[0])); michael@0: tempbuffer[j-1] = convertLamAlef[ dest[i] - 0xFEF5 ]; michael@0: j--; michael@0: countl--; michael@0: }else { michael@0: if( countl == 0 && isLamAlefChar(dest[i]) ) { michael@0: *pErrorCode=U_NO_SPACE_AVAILABLE; michael@0: } michael@0: tempbuffer[j] = dest[i]; michael@0: } michael@0: i--; michael@0: j--; michael@0: } michael@0: uprv_memcpy(dest, tempbuffer, sourceLength*U_SIZEOF_UCHAR); michael@0: michael@0: uprv_free(tempbuffer); michael@0: michael@0: destSize = sourceLength; michael@0: return destSize; michael@0: } michael@0: michael@0: /* michael@0: *Name : expandCompositCharAtEnd michael@0: *Function : Expands the LamAlef character to Lam and Alef consuming the michael@0: * required space from end of the buffer. If the text type was michael@0: * Visual LTR and the option U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END michael@0: * was used, the spaces will be consumed from begin of buffer. If michael@0: * there are no spaces to expand the LamAlef, an error michael@0: * will be set to U_NO_SPACE_AVAILABLE as defined in utypes.h michael@0: */ michael@0: michael@0: static int32_t michael@0: expandCompositCharAtEnd(UChar *dest, int32_t sourceLength, int32_t destSize,UErrorCode *pErrorCode) { michael@0: int32_t i = 0,j = 0; michael@0: michael@0: int32_t countr = 0; michael@0: int32_t inpsize = sourceLength; michael@0: michael@0: UChar *tempbuffer=NULL; michael@0: tempbuffer = (UChar *)uprv_malloc((sourceLength+1)*U_SIZEOF_UCHAR); michael@0: michael@0: /* Test for NULL */ michael@0: if(tempbuffer == NULL) { michael@0: *pErrorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: uprv_memset(tempbuffer, 0, (sourceLength+1)*U_SIZEOF_UCHAR); michael@0: michael@0: while(dest[inpsize-1] == SPACE_CHAR) { michael@0: countr++; michael@0: inpsize--; michael@0: } michael@0: michael@0: i = sourceLength - countr - 1; michael@0: j = sourceLength - 1; michael@0: michael@0: while(i >= 0 && j >= 0) { michael@0: if( countr>0 && isLamAlefChar(dest[i]) ) { michael@0: tempbuffer[j] = LAM_CHAR; michael@0: tempbuffer[j-1] = convertLamAlef[ dest[i] - 0xFEF5 ]; michael@0: j--; michael@0: countr--; michael@0: }else { michael@0: if ((countr == 0) && isLamAlefChar(dest[i]) ) { michael@0: *pErrorCode=U_NO_SPACE_AVAILABLE; michael@0: } michael@0: tempbuffer[j] = dest[i]; michael@0: } michael@0: i--; michael@0: j--; michael@0: } michael@0: michael@0: if(countr > 0) { michael@0: uprv_memmove(tempbuffer, tempbuffer+countr, sourceLength*U_SIZEOF_UCHAR); michael@0: if(u_strlen(tempbuffer) < sourceLength) { michael@0: for(i=sourceLength-1;i>=sourceLength-countr;i--) { michael@0: tempbuffer[i] = SPACE_CHAR; michael@0: } michael@0: } michael@0: } michael@0: uprv_memcpy(dest, tempbuffer, sourceLength*U_SIZEOF_UCHAR); michael@0: michael@0: uprv_free(tempbuffer); michael@0: michael@0: destSize = sourceLength; michael@0: return destSize; michael@0: } michael@0: michael@0: /* michael@0: *Name : expandCompositCharAtNear michael@0: *Function : Expands the LamAlef character into Lam + Alef, YehHamza character michael@0: * into Yeh + Hamza, SeenFamily character into SeenFamily character michael@0: * + Tail, while consuming the space next to the character. michael@0: * If there are no spaces next to the character, an error michael@0: * will be set to U_NO_SPACE_AVAILABLE as defined in utypes.h michael@0: */ michael@0: michael@0: static int32_t michael@0: expandCompositCharAtNear(UChar *dest, int32_t sourceLength, int32_t destSize,UErrorCode *pErrorCode, michael@0: int yehHamzaOption, int seenTailOption, int lamAlefOption, struct uShapeVariables shapeVars) { michael@0: int32_t i = 0; michael@0: michael@0: michael@0: UChar lamalefChar, yehhamzaChar; michael@0: michael@0: for(i = 0 ;i<=sourceLength-1;i++) { michael@0: if (seenTailOption && isSeenTailFamilyChar(dest[i])) { michael@0: if ((i>0) && (dest[i-1] == SPACE_CHAR) ) { michael@0: dest[i-1] = shapeVars.tailChar; michael@0: }else { michael@0: *pErrorCode=U_NO_SPACE_AVAILABLE; michael@0: } michael@0: }else if(yehHamzaOption && (isYehHamzaChar(dest[i])) ) { michael@0: if ((i>0) && (dest[i-1] == SPACE_CHAR) ) { michael@0: yehhamzaChar = dest[i]; michael@0: dest[i] = yehHamzaToYeh[yehhamzaChar - YEH_HAMZAFE_CHAR]; michael@0: dest[i-1] = HAMZAFE_CHAR; michael@0: }else { michael@0: michael@0: *pErrorCode=U_NO_SPACE_AVAILABLE; michael@0: } michael@0: }else if(lamAlefOption && isLamAlefChar(dest[i+1])) { michael@0: if(dest[i] == SPACE_CHAR){ michael@0: lamalefChar = dest[i+1]; michael@0: dest[i+1] = LAM_CHAR; michael@0: dest[i] = convertLamAlef[ lamalefChar - 0xFEF5 ]; michael@0: }else { michael@0: *pErrorCode=U_NO_SPACE_AVAILABLE; michael@0: } michael@0: } michael@0: } michael@0: destSize = sourceLength; michael@0: return destSize; michael@0: } michael@0: /* michael@0: * Name : expandCompositChar michael@0: * Function : LamAlef, need special handling, since it expands from one michael@0: * character into two characters while shaping or deshaping. michael@0: * In order to expand it, near or far spaces according to the michael@0: * options user specifies. Also buffer size can be increased. michael@0: * michael@0: * For SeenFamily characters and YehHamza only the near option is michael@0: * supported, while for LamAlef we can take spaces from begin, end, michael@0: * near or even increase the buffer size. michael@0: * There is also the Auto option for LamAlef only, which will first michael@0: * search for a space at end, begin then near, respectively. michael@0: * If there are no spaces to expand these characters, an error will be set to michael@0: * U_NO_SPACE_AVAILABLE as defined in utypes.h michael@0: */ michael@0: michael@0: static int32_t michael@0: expandCompositChar(UChar *dest, int32_t sourceLength, michael@0: int32_t destSize,uint32_t options, michael@0: UErrorCode *pErrorCode, int shapingMode,struct uShapeVariables shapeVars) { michael@0: michael@0: int32_t i = 0,j = 0; michael@0: michael@0: UChar *tempbuffer=NULL; michael@0: int yehHamzaOption = 0; michael@0: int seenTailOption = 0; michael@0: int lamAlefOption = 0; michael@0: michael@0: if (shapingMode == 1){ michael@0: if ( (options&U_SHAPE_LAMALEF_MASK) == U_SHAPE_LAMALEF_AUTO){ michael@0: michael@0: if(shapeVars.spacesRelativeToTextBeginEnd == 0) { michael@0: destSize = expandCompositCharAtEnd(dest, sourceLength, destSize, pErrorCode); michael@0: michael@0: if(*pErrorCode == U_NO_SPACE_AVAILABLE) { michael@0: *pErrorCode = U_ZERO_ERROR; michael@0: destSize = expandCompositCharAtBegin(dest, sourceLength, destSize, pErrorCode); michael@0: } michael@0: }else { michael@0: destSize = expandCompositCharAtBegin(dest, sourceLength, destSize, pErrorCode); michael@0: michael@0: if(*pErrorCode == U_NO_SPACE_AVAILABLE) { michael@0: *pErrorCode = U_ZERO_ERROR; michael@0: destSize = expandCompositCharAtEnd(dest, sourceLength, destSize, pErrorCode); michael@0: } michael@0: } michael@0: michael@0: if(*pErrorCode == U_NO_SPACE_AVAILABLE) { michael@0: *pErrorCode = U_ZERO_ERROR; michael@0: destSize = expandCompositCharAtNear(dest, sourceLength, destSize, pErrorCode, yehHamzaOption, michael@0: seenTailOption, 1,shapeVars); michael@0: } michael@0: } michael@0: } michael@0: michael@0: if (shapingMode == 1){ michael@0: if ( (options&U_SHAPE_LAMALEF_MASK) == shapeVars.uShapeLamalefEnd){ michael@0: destSize = expandCompositCharAtEnd(dest, sourceLength, destSize, pErrorCode); michael@0: } michael@0: } michael@0: michael@0: if (shapingMode == 1){ michael@0: if ( (options&U_SHAPE_LAMALEF_MASK) == shapeVars.uShapeLamalefBegin){ michael@0: destSize = expandCompositCharAtBegin(dest, sourceLength, destSize, pErrorCode); michael@0: } michael@0: } michael@0: michael@0: if (shapingMode == 0){ michael@0: if ((options&U_SHAPE_YEHHAMZA_MASK) == U_SHAPE_YEHHAMZA_TWOCELL_NEAR){ michael@0: yehHamzaOption = 1; michael@0: } michael@0: if ((options&U_SHAPE_SEEN_MASK) == U_SHAPE_SEEN_TWOCELL_NEAR){ michael@0: seenTailOption = 1; michael@0: } michael@0: } michael@0: if (shapingMode == 1) { michael@0: if ( (options&U_SHAPE_LAMALEF_MASK) == U_SHAPE_LAMALEF_NEAR) { michael@0: lamAlefOption = 1; michael@0: } michael@0: } michael@0: michael@0: michael@0: if (yehHamzaOption || seenTailOption || lamAlefOption){ michael@0: destSize = expandCompositCharAtNear(dest, sourceLength, destSize, pErrorCode, yehHamzaOption, michael@0: seenTailOption,lamAlefOption,shapeVars); michael@0: } michael@0: michael@0: michael@0: if (shapingMode == 1){ michael@0: if ( (options&U_SHAPE_LAMALEF_MASK) == U_SHAPE_LAMALEF_RESIZE){ michael@0: destSize = calculateSize(dest,sourceLength,destSize,options); michael@0: tempbuffer = (UChar *)uprv_malloc((destSize+1)*U_SIZEOF_UCHAR); michael@0: michael@0: /* Test for NULL */ michael@0: if(tempbuffer == NULL) { michael@0: *pErrorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: uprv_memset(tempbuffer, 0, (destSize+1)*U_SIZEOF_UCHAR); michael@0: michael@0: i = j = 0; michael@0: while(i < destSize && j < destSize) { michael@0: if(isLamAlefChar(dest[i]) ) { michael@0: tempbuffer[j] = convertLamAlef[ dest[i] - 0xFEF5 ]; michael@0: tempbuffer[j+1] = LAM_CHAR; michael@0: j++; michael@0: }else { michael@0: tempbuffer[j] = dest[i]; michael@0: } michael@0: i++; michael@0: j++; michael@0: } michael@0: michael@0: uprv_memcpy(dest, tempbuffer, destSize*U_SIZEOF_UCHAR); michael@0: } michael@0: } michael@0: michael@0: if(tempbuffer) { michael@0: uprv_free(tempbuffer); michael@0: } michael@0: return destSize; michael@0: } michael@0: michael@0: /* michael@0: *Name : shapeUnicode michael@0: *Function : Converts an Arabic Unicode buffer in 06xx Range into a shaped michael@0: * arabic Unicode buffer in FExx Range michael@0: */ michael@0: static int32_t michael@0: shapeUnicode(UChar *dest, int32_t sourceLength, michael@0: int32_t destSize,uint32_t options, michael@0: UErrorCode *pErrorCode, michael@0: int tashkeelFlag, struct uShapeVariables shapeVars) { michael@0: michael@0: int32_t i, iend; michael@0: int32_t step; michael@0: int32_t lastPos,Nx, Nw; michael@0: unsigned int Shape; michael@0: int32_t lamalef_found = 0; michael@0: int32_t seenfamFound = 0, yehhamzaFound =0, tashkeelFound = 0; michael@0: UChar prevLink = 0, lastLink = 0, currLink, nextLink = 0; michael@0: UChar wLamalef; michael@0: michael@0: /* michael@0: * Converts the input buffer from FExx Range into 06xx Range michael@0: * to make sure that all characters are in the 06xx range michael@0: * even the lamalef is converted to the special region in michael@0: * the 06xx range michael@0: */ michael@0: if ((options & U_SHAPE_PRESERVE_PRESENTATION_MASK) == U_SHAPE_PRESERVE_PRESENTATION_NOOP) { michael@0: for (i = 0; i < sourceLength; i++) { michael@0: UChar inputChar = dest[i]; michael@0: if ( (inputChar >= 0xFB50) && (inputChar <= 0xFBFF)) { michael@0: UChar c = convertFBto06 [ (inputChar - 0xFB50) ]; michael@0: if (c != 0) michael@0: dest[i] = c; michael@0: } else if ( (inputChar >= 0xFE70) && (inputChar <= 0xFEFC)) { michael@0: dest[i] = convertFEto06 [ (inputChar - 0xFE70) ] ; michael@0: } else { michael@0: dest[i] = inputChar ; michael@0: } michael@0: } michael@0: } michael@0: michael@0: michael@0: /* sets the index to the end of the buffer, together with the step point to -1 */ michael@0: i = sourceLength - 1; michael@0: iend = -1; michael@0: step = -1; michael@0: michael@0: /* michael@0: * This function resolves the link between the characters . michael@0: * Arabic characters have four forms : michael@0: * Isolated Form, Initial Form, Middle Form and Final Form michael@0: */ michael@0: currLink = getLink(dest[i]); michael@0: michael@0: lastPos = i; michael@0: Nx = -2, Nw = 0; michael@0: michael@0: while (i != iend) { michael@0: /* If high byte of currLink > 0 then more than one shape */ michael@0: if ((currLink & 0xFF00) > 0 || (getLink(dest[i]) & IRRELEVANT) != 0) { michael@0: Nw = i + step; michael@0: while (Nx < 0) { /* we need to know about next char */ michael@0: if(Nw == iend) { michael@0: nextLink = 0; michael@0: Nx = 3000; michael@0: } else { michael@0: nextLink = getLink(dest[Nw]); michael@0: if((nextLink & IRRELEVANT) == 0) { michael@0: Nx = Nw; michael@0: } else { michael@0: Nw = Nw + step; michael@0: } michael@0: } michael@0: } michael@0: michael@0: if ( ((currLink & ALEFTYPE) > 0) && ((lastLink & LAMTYPE) > 0) ) { michael@0: lamalef_found = 1; michael@0: wLamalef = changeLamAlef(dest[i]); /*get from 0x065C-0x065f */ michael@0: if ( wLamalef != 0) { michael@0: dest[i] = LAMALEF_SPACE_SUB; /* The default case is to drop the Alef and replace */ michael@0: dest[lastPos] =wLamalef; /* it by LAMALEF_SPACE_SUB which is the last character in the */ michael@0: i=lastPos; /* unicode private use area, this is done to make */ michael@0: } /* sure that removeLamAlefSpaces() handles only the */ michael@0: lastLink = prevLink; /* spaces generated during lamalef generation. */ michael@0: currLink = getLink(wLamalef); /* LAMALEF_SPACE_SUB is added here and is replaced by spaces */ michael@0: } /* in removeLamAlefSpaces() */ michael@0: michael@0: if ((i > 0) && (dest[i-1] == SPACE_CHAR)){ michael@0: if ( isSeenFamilyChar(dest[i])) { michael@0: seenfamFound = 1; michael@0: } else if (dest[i] == YEH_HAMZA_CHAR) { michael@0: yehhamzaFound = 1; michael@0: } michael@0: } michael@0: else if(i==0){ michael@0: if ( isSeenFamilyChar(dest[i])){ michael@0: seenfamFound = 1; michael@0: } else if (dest[i] == YEH_HAMZA_CHAR) { michael@0: yehhamzaFound = 1; michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * get the proper shape according to link ability of neighbors michael@0: * and of character; depends on the order of the shapes michael@0: * (isolated, initial, middle, final) in the compatibility area michael@0: */ michael@0: Shape = shapeTable[nextLink & (LINKR + LINKL)] michael@0: [lastLink & (LINKR + LINKL)] michael@0: [currLink & (LINKR + LINKL)]; michael@0: michael@0: if ((currLink & (LINKR+LINKL)) == 1) { michael@0: Shape &= 1; michael@0: } else if(isTashkeelChar(dest[i])) { michael@0: if( (lastLink & LINKL) && (nextLink & LINKR) && (tashkeelFlag == 1) && michael@0: dest[i] != 0x064C && dest[i] != 0x064D ) michael@0: { michael@0: Shape = 1; michael@0: if( (nextLink&ALEFTYPE) == ALEFTYPE && (lastLink&LAMTYPE) == LAMTYPE ) { michael@0: Shape = 0; michael@0: } michael@0: } else if(tashkeelFlag == 2 && dest[i] == SHADDA06_CHAR){ michael@0: Shape = 1; michael@0: } else { michael@0: Shape = 0; michael@0: } michael@0: } michael@0: if ((dest[i] ^ 0x0600) < 0x100) { michael@0: if ( isTashkeelChar(dest[i]) ){ michael@0: if (tashkeelFlag == 2 && dest[i] != SHADDA06_CHAR){ michael@0: dest[i] = TASHKEEL_SPACE_SUB; michael@0: tashkeelFound = 1; michael@0: } else { michael@0: /* to ensure the array index is within the range */ michael@0: U_ASSERT(dest[i] >= 0x064Bu michael@0: && dest[i]-0x064Bu < sizeof(IrrelevantPos)/sizeof(IrrelevantPos[0])); michael@0: dest[i] = 0xFE70 + IrrelevantPos[(dest[i] - 0x064B)] + Shape; michael@0: } michael@0: }else if ((currLink & APRESENT) > 0) { michael@0: dest[i] = (UChar)(0xFB50 + (currLink >> 8) + Shape); michael@0: }else if ((currLink >> 8) > 0 && (currLink & IRRELEVANT) == 0) { michael@0: dest[i] = (UChar)(0xFE70 + (currLink >> 8) + Shape); michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* move one notch forward */ michael@0: if ((currLink & IRRELEVANT) == 0) { michael@0: prevLink = lastLink; michael@0: lastLink = currLink; michael@0: lastPos = i; michael@0: } michael@0: michael@0: i = i + step; michael@0: if (i == Nx) { michael@0: currLink = nextLink; michael@0: Nx = -2; michael@0: } else if(i != iend) { michael@0: currLink = getLink(dest[i]); michael@0: } michael@0: } michael@0: destSize = sourceLength; michael@0: if ( (lamalef_found != 0 ) || (tashkeelFound != 0) ){ michael@0: destSize = handleGeneratedSpaces(dest,sourceLength,destSize,options,pErrorCode, shapeVars); michael@0: } michael@0: michael@0: if ( (seenfamFound != 0) || (yehhamzaFound != 0) ) { michael@0: destSize = expandCompositChar(dest, sourceLength,destSize,options,pErrorCode, SHAPE_MODE,shapeVars); michael@0: } michael@0: return destSize; michael@0: } michael@0: michael@0: /* michael@0: *Name : deShapeUnicode michael@0: *Function : Converts an Arabic Unicode buffer in FExx Range into unshaped michael@0: * arabic Unicode buffer in 06xx Range michael@0: */ michael@0: static int32_t michael@0: deShapeUnicode(UChar *dest, int32_t sourceLength, michael@0: int32_t destSize,uint32_t options, michael@0: UErrorCode *pErrorCode, struct uShapeVariables shapeVars) { michael@0: int32_t i = 0; michael@0: int32_t lamalef_found = 0; michael@0: int32_t yehHamzaComposeEnabled = 0; michael@0: int32_t seenComposeEnabled = 0; michael@0: michael@0: yehHamzaComposeEnabled = ((options&U_SHAPE_YEHHAMZA_MASK) == U_SHAPE_YEHHAMZA_TWOCELL_NEAR) ? 1 : 0; michael@0: seenComposeEnabled = ((options&U_SHAPE_SEEN_MASK) == U_SHAPE_SEEN_TWOCELL_NEAR)? 1 : 0; michael@0: michael@0: /* michael@0: *This for loop changes the buffer from the Unicode FE range to michael@0: *the Unicode 06 range michael@0: */ michael@0: michael@0: for(i = 0; i < sourceLength; i++) { michael@0: UChar inputChar = dest[i]; michael@0: if ( (inputChar >= 0xFB50) && (inputChar <= 0xFBFF)) { /* FBxx Arabic range */ michael@0: UChar c = convertFBto06 [ (inputChar - 0xFB50) ]; michael@0: if (c != 0) michael@0: dest[i] = c; michael@0: } else if( (yehHamzaComposeEnabled == 1) && ((inputChar == HAMZA06_CHAR) || (inputChar == HAMZAFE_CHAR)) michael@0: && (i < (sourceLength - 1)) && isAlefMaksouraChar(dest[i+1] )) { michael@0: dest[i] = SPACE_CHAR; michael@0: dest[i+1] = YEH_HAMZA_CHAR; michael@0: } else if ( (seenComposeEnabled == 1) && (isTailChar(inputChar)) && (i< (sourceLength - 1)) michael@0: && (isSeenTailFamilyChar(dest[i+1])) ) { michael@0: dest[i] = SPACE_CHAR; michael@0: } else if (( inputChar >= 0xFE70) && (inputChar <= 0xFEF4 )) { /* FExx Arabic range */ michael@0: dest[i] = convertFEto06 [ (inputChar - 0xFE70) ]; michael@0: } else { michael@0: dest[i] = inputChar ; michael@0: } michael@0: michael@0: if( isLamAlefChar(dest[i]) ) michael@0: lamalef_found = 1; michael@0: } michael@0: michael@0: destSize = sourceLength; michael@0: if (lamalef_found != 0){ michael@0: destSize = expandCompositChar(dest,sourceLength,destSize,options,pErrorCode,DESHAPE_MODE, shapeVars); michael@0: } michael@0: return destSize; michael@0: } michael@0: michael@0: /* michael@0: **************************************** michael@0: * u_shapeArabic michael@0: **************************************** michael@0: */ michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: u_shapeArabic(const UChar *source, int32_t sourceLength, michael@0: UChar *dest, int32_t destCapacity, michael@0: uint32_t options, michael@0: UErrorCode *pErrorCode) { michael@0: michael@0: int32_t destLength; michael@0: struct uShapeVariables shapeVars = { OLD_TAIL_CHAR,U_SHAPE_LAMALEF_BEGIN,U_SHAPE_LAMALEF_END,U_SHAPE_TASHKEEL_BEGIN,U_SHAPE_TASHKEEL_END,0}; michael@0: michael@0: /* usual error checking */ michael@0: if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { michael@0: return 0; michael@0: } michael@0: michael@0: /* make sure that no reserved options values are used; allow dest==NULL only for preflighting */ michael@0: if( source==NULL || sourceLength<-1 || (dest==NULL && destCapacity!=0) || destCapacity<0 || michael@0: (((options&U_SHAPE_TASHKEEL_MASK) > 0) && michael@0: ((options&U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED) == U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED) ) || michael@0: (((options&U_SHAPE_TASHKEEL_MASK) > 0) && michael@0: ((options&U_SHAPE_LETTERS_MASK) == U_SHAPE_LETTERS_UNSHAPE)) || michael@0: (options&U_SHAPE_DIGIT_TYPE_RESERVED)==U_SHAPE_DIGIT_TYPE_RESERVED || michael@0: (options&U_SHAPE_DIGITS_MASK)==U_SHAPE_DIGITS_RESERVED || michael@0: ((options&U_SHAPE_LAMALEF_MASK) != U_SHAPE_LAMALEF_RESIZE && michael@0: (options&U_SHAPE_AGGREGATE_TASHKEEL_MASK) != 0) || michael@0: ((options&U_SHAPE_AGGREGATE_TASHKEEL_MASK) == U_SHAPE_AGGREGATE_TASHKEEL && michael@0: (options&U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED) != U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED) michael@0: ) michael@0: { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return 0; michael@0: } michael@0: /* Validate lamalef options */ michael@0: if(((options&U_SHAPE_LAMALEF_MASK) > 0)&& michael@0: !(((options & U_SHAPE_LAMALEF_MASK)==U_SHAPE_LAMALEF_BEGIN) || michael@0: ((options & U_SHAPE_LAMALEF_MASK)==U_SHAPE_LAMALEF_END ) || michael@0: ((options & U_SHAPE_LAMALEF_MASK)==U_SHAPE_LAMALEF_RESIZE )|| michael@0: ((options & U_SHAPE_LAMALEF_MASK)==U_SHAPE_LAMALEF_AUTO) || michael@0: ((options & U_SHAPE_LAMALEF_MASK)==U_SHAPE_LAMALEF_NEAR))) michael@0: { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return 0; michael@0: } michael@0: /* Validate Tashkeel options */ michael@0: if(((options&U_SHAPE_TASHKEEL_MASK) > 0)&& michael@0: !(((options & U_SHAPE_TASHKEEL_MASK)==U_SHAPE_TASHKEEL_BEGIN) || michael@0: ((options & U_SHAPE_TASHKEEL_MASK)==U_SHAPE_TASHKEEL_END ) michael@0: ||((options & U_SHAPE_TASHKEEL_MASK)==U_SHAPE_TASHKEEL_RESIZE )|| michael@0: ((options & U_SHAPE_TASHKEEL_MASK)==U_SHAPE_TASHKEEL_REPLACE_BY_TATWEEL))) michael@0: { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: return 0; michael@0: } michael@0: /* determine the source length */ michael@0: if(sourceLength==-1) { michael@0: sourceLength=u_strlen(source); michael@0: } michael@0: if(sourceLength<=0) { michael@0: return u_terminateUChars(dest, destCapacity, 0, pErrorCode); michael@0: } michael@0: michael@0: /* check that source and destination do not overlap */ michael@0: if( dest!=NULL && michael@0: ((source<=dest && dest0) { michael@0: int32_t logical_order = (options&U_SHAPE_TEXT_DIRECTION_MASK) == U_SHAPE_TEXT_DIRECTION_LOGICAL; michael@0: int32_t aggregate_tashkeel = michael@0: (options&(U_SHAPE_AGGREGATE_TASHKEEL_MASK+U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED)) == michael@0: (U_SHAPE_AGGREGATE_TASHKEEL+U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED); michael@0: int step=logical_order?1:-1; michael@0: int j=logical_order?-1:2*sourceLength; michael@0: int i=logical_order?-1:sourceLength; michael@0: int end=logical_order?sourceLength:-1; michael@0: int aggregation_possible = 1; michael@0: UChar prev = 0; michael@0: UChar prevLink, currLink = 0; michael@0: int newSourceLength = 0; michael@0: tempsource = (UChar *)uprv_malloc(2*sourceLength*U_SIZEOF_UCHAR); michael@0: if(tempsource == NULL) { michael@0: *pErrorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: return 0; michael@0: } michael@0: michael@0: while ((i+=step) != end) { michael@0: prevLink = currLink; michael@0: currLink = getLink(source[i]); michael@0: if (aggregate_tashkeel && ((prevLink|currLink)&COMBINE) == COMBINE && aggregation_possible) { michael@0: aggregation_possible = 0; michael@0: tempsource[j] = (prevdestCapacity) { michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: if (tempsource != NULL) uprv_free(tempsource); michael@0: return outputSize; michael@0: } michael@0: michael@0: /* michael@0: * need a temporary buffer of size max(outputSize, sourceLength) michael@0: * because at first we copy source->temp michael@0: */ michael@0: if(sourceLength>outputSize) { michael@0: outputSize=sourceLength; michael@0: } michael@0: michael@0: /* Start of Arabic letter shaping part */ michael@0: if(outputSize<=LENGTHOF(buffer)) { michael@0: outputSize=LENGTHOF(buffer); michael@0: tempbuffer=buffer; michael@0: } else { michael@0: tempbuffer = (UChar *)uprv_malloc(outputSize*U_SIZEOF_UCHAR); michael@0: michael@0: /*Test for NULL*/ michael@0: if(tempbuffer == NULL) { michael@0: *pErrorCode = U_MEMORY_ALLOCATION_ERROR; michael@0: if (tempsource != NULL) uprv_free(tempsource); michael@0: return 0; michael@0: } michael@0: } michael@0: uprv_memcpy(tempbuffer, source, sourceLength*U_SIZEOF_UCHAR); michael@0: if (tempsource != NULL){ michael@0: uprv_free(tempsource); michael@0: } michael@0: michael@0: if(sourceLength 0 michael@0: && ((options&U_SHAPE_TASHKEEL_MASK) !=U_SHAPE_TASHKEEL_REPLACE_BY_TATWEEL)) { michael@0: /* Call the shaping function with tashkeel flag == 2 for removal of tashkeel */ michael@0: destLength = shapeUnicode(tempbuffer,sourceLength,destCapacity,options,pErrorCode,2,shapeVars); michael@0: }else { michael@0: /* default Call the shaping function with tashkeel flag == 1 */ michael@0: destLength = shapeUnicode(tempbuffer,sourceLength,destCapacity,options,pErrorCode,1,shapeVars); michael@0: michael@0: /*After shaping text check if user wants to remove tashkeel and replace it with tatweel*/ michael@0: if( (options&U_SHAPE_TASHKEEL_MASK) == U_SHAPE_TASHKEEL_REPLACE_BY_TATWEEL){ michael@0: destLength = handleTashkeelWithTatweel(tempbuffer,destLength,destCapacity,options,pErrorCode); michael@0: } michael@0: } michael@0: break; michael@0: case U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED : michael@0: /* Call the shaping function with tashkeel flag == 0 */ michael@0: destLength = shapeUnicode(tempbuffer,sourceLength,destCapacity,options,pErrorCode,0,shapeVars); michael@0: break; michael@0: michael@0: case U_SHAPE_LETTERS_UNSHAPE : michael@0: /* Call the deshaping function */ michael@0: destLength = deShapeUnicode(tempbuffer,sourceLength,destCapacity,options,pErrorCode,shapeVars); michael@0: break; michael@0: default : michael@0: /* will never occur because of validity checks above */ michael@0: destLength = 0; michael@0: break; michael@0: } michael@0: michael@0: /* michael@0: * TODO: (markus 2002aug01) michael@0: * For as long as we always preflight the outputSize above michael@0: * we should U_ASSERT(outputSize==destLength) michael@0: * except for the adjustment above before the tempbuffer allocation michael@0: */ michael@0: michael@0: if((options&U_SHAPE_TEXT_DIRECTION_MASK) == U_SHAPE_TEXT_DIRECTION_LOGICAL) { michael@0: countSpaces(tempbuffer,destLength,options,&spacesCountl,&spacesCountr); michael@0: invertBuffer(tempbuffer,destLength,options,spacesCountl,spacesCountr); michael@0: } michael@0: uprv_memcpy(dest, tempbuffer, uprv_min(destLength, destCapacity)*U_SIZEOF_UCHAR); michael@0: michael@0: if(tempbuffer!=buffer) { michael@0: uprv_free(tempbuffer); michael@0: } michael@0: michael@0: if(destLength>destCapacity) { michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: return destLength; michael@0: } michael@0: michael@0: /* End of Arabic letter shaping part */ michael@0: } else { michael@0: /* michael@0: * No letter shaping: michael@0: * just make sure the destination is large enough and copy the string. michael@0: */ michael@0: if(destCapacity