| |
1 /* |
| |
2 ********************************************************************** |
| |
3 * Copyright (C) 2005-2009, International Business Machines |
| |
4 * Corporation and others. All Rights Reserved. |
| |
5 ********************************************************************** |
| |
6 */ |
| |
7 |
| |
8 #include "unicode/utypes.h" |
| |
9 |
| |
10 #if !UCONFIG_NO_CONVERSION |
| |
11 |
| |
12 #include "inputext.h" |
| |
13 |
| |
14 #include "cmemory.h" |
| |
15 #include "cstring.h" |
| |
16 |
| |
17 #include <string.h> |
| |
18 |
| |
19 U_NAMESPACE_BEGIN |
| |
20 |
| |
// Capacity, in bytes, of the buffer that receives the text to be analyzed.
#define BUFFER_SIZE 8192

// Number of elements in a true array (not valid for pointers).
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Allocates uninitialized storage for `count` objects of `type` through
// uprv_malloc. The expansion is fully parenthesized so that the result can be
// used inside a larger expression (subscripting, comparison, ...).
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))

// Releases storage obtained with NEW_ARRAY.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
| |
27 |
| |
28 InputText::InputText(UErrorCode &status) |
| |
29 : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been |
| |
30 // removed if appropriate. |
| |
31 fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text. |
| |
32 // Value is percent, not absolute. |
| |
33 fDeclaredEncoding(0), |
| |
34 fRawInput(0), |
| |
35 fRawLength(0) |
| |
36 { |
| |
37 if (fInputBytes == NULL || fByteStats == NULL) { |
| |
38 status = U_MEMORY_ALLOCATION_ERROR; |
| |
39 } |
| |
40 } |
| |
41 |
| |
42 InputText::~InputText() |
| |
43 { |
| |
44 DELETE_ARRAY(fDeclaredEncoding); |
| |
45 DELETE_ARRAY(fByteStats); |
| |
46 DELETE_ARRAY(fInputBytes); |
| |
47 } |
| |
48 |
| |
49 void InputText::setText(const char *in, int32_t len) |
| |
50 { |
| |
51 fInputLen = 0; |
| |
52 fC1Bytes = FALSE; |
| |
53 fRawInput = (const uint8_t *) in; |
| |
54 fRawLength = len == -1? (int32_t)uprv_strlen(in) : len; |
| |
55 } |
| |
56 |
| |
57 void InputText::setDeclaredEncoding(const char* encoding, int32_t len) |
| |
58 { |
| |
59 if(encoding) { |
| |
60 if (len == -1) { |
| |
61 len = (int32_t)uprv_strlen(encoding); |
| |
62 } |
| |
63 |
| |
64 len += 1; // to make place for the \0 at the end. |
| |
65 uprv_free(fDeclaredEncoding); |
| |
66 fDeclaredEncoding = NEW_ARRAY(char, len); |
| |
67 uprv_strncpy(fDeclaredEncoding, encoding, len); |
| |
68 } |
| |
69 } |
| |
70 |
| |
71 UBool InputText::isSet() const |
| |
72 { |
| |
73 return fRawInput != NULL; |
| |
74 } |
| |
75 |
| |
76 /** |
| |
77 * MungeInput - after getting a set of raw input data to be analyzed, preprocess |
| |
78 * it by removing what appears to be html markup. |
| |
79 * |
| |
80 * @internal |
| |
81 */ |
| |
82 void InputText::MungeInput(UBool fStripTags) { |
| |
83 int srci = 0; |
| |
84 int dsti = 0; |
| |
85 uint8_t b; |
| |
86 bool inMarkup = FALSE; |
| |
87 int32_t openTags = 0; |
| |
88 int32_t badTags = 0; |
| |
89 |
| |
90 // |
| |
91 // html / xml markup stripping. |
| |
92 // quick and dirty, not 100% accurate, but hopefully good enough, statistically. |
| |
93 // discard everything within < brackets > |
| |
94 // Count how many total '<' and illegal (nested) '<' occur, so we can make some |
| |
95 // guess as to whether the input was actually marked up at all. |
| |
96 // TODO: Think about how this interacts with EBCDIC charsets that are detected. |
| |
97 if (fStripTags) { |
| |
98 for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) { |
| |
99 b = fRawInput[srci]; |
| |
100 |
| |
101 if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */ |
| |
102 if (inMarkup) { |
| |
103 badTags += 1; |
| |
104 } |
| |
105 |
| |
106 inMarkup = TRUE; |
| |
107 openTags += 1; |
| |
108 } |
| |
109 |
| |
110 if (! inMarkup) { |
| |
111 fInputBytes[dsti++] = b; |
| |
112 } |
| |
113 |
| |
114 if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */ |
| |
115 inMarkup = FALSE; |
| |
116 } |
| |
117 } |
| |
118 |
| |
119 fInputLen = dsti; |
| |
120 } |
| |
121 |
| |
122 // |
| |
123 // If it looks like this input wasn't marked up, or if it looks like it's |
| |
124 // essentially nothing but markup abandon the markup stripping. |
| |
125 // Detection will have to work on the unstripped input. |
| |
126 // |
| |
127 if (openTags<5 || openTags/5 < badTags || |
| |
128 (fInputLen < 100 && fRawLength>600)) |
| |
129 { |
| |
130 int32_t limit = fRawLength; |
| |
131 |
| |
132 if (limit > BUFFER_SIZE) { |
| |
133 limit = BUFFER_SIZE; |
| |
134 } |
| |
135 |
| |
136 for (srci=0; srci<limit; srci++) { |
| |
137 fInputBytes[srci] = fRawInput[srci]; |
| |
138 } |
| |
139 |
| |
140 fInputLen = srci; |
| |
141 } |
| |
142 |
| |
143 // |
| |
144 // Tally up the byte occurence statistics. |
| |
145 // These are available for use by the various detectors. |
| |
146 // |
| |
147 |
| |
148 uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256); |
| |
149 |
| |
150 for (srci = 0; srci < fInputLen; srci += 1) { |
| |
151 fByteStats[fInputBytes[srci]] += 1; |
| |
152 } |
| |
153 |
| |
154 for (int32_t i = 0x80; i <= 0x9F; i += 1) { |
| |
155 if (fByteStats[i] != 0) { |
| |
156 fC1Bytes = TRUE; |
| |
157 break; |
| |
158 } |
| |
159 } |
| |
160 } |
| |
161 |
| |
162 U_NAMESPACE_END |
| |
163 #endif |
| |
164 |