|
1 /* |
|
2 ********************************************************************** |
|
3 * Copyright (C) 2005-2009, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ********************************************************************** |
|
6 */ |
|
7 |
|
8 #include "unicode/utypes.h" |
|
9 |
|
10 #if !UCONFIG_NO_CONVERSION |
|
11 |
|
12 #include "inputext.h" |
|
13 |
|
14 #include "cmemory.h" |
|
15 #include "cstring.h" |
|
16 |
|
17 #include <string.h> |
|
18 |
|
19 U_NAMESPACE_BEGIN |
|
20 |
|
// Maximum number of input bytes that are copied into the working buffer and analyzed.
#define BUFFER_SIZE 8192

// Number of elements in a true array (not a pointer). The argument is
// parenthesized so that any array-valued expression can be passed safely.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Allocate an uninitialized array of `count` elements of `type` with uprv_malloc().
// The whole expansion is parenthesized so the result can be used directly inside a
// larger expression (e.g. subscripted) without the cast binding to the wrong operand.
// Evaluates to NULL when the allocation fails.
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))

// Release an array obtained from NEW_ARRAY().
#define DELETE_ARRAY(array) uprv_free((void *) (array))
|
27 |
|
28 InputText::InputText(UErrorCode &status) |
|
29 : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been |
|
30 // removed if appropriate. |
|
31 fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text. |
|
32 // Value is percent, not absolute. |
|
33 fDeclaredEncoding(0), |
|
34 fRawInput(0), |
|
35 fRawLength(0) |
|
36 { |
|
37 if (fInputBytes == NULL || fByteStats == NULL) { |
|
38 status = U_MEMORY_ALLOCATION_ERROR; |
|
39 } |
|
40 } |
|
41 |
|
42 InputText::~InputText() |
|
43 { |
|
44 DELETE_ARRAY(fDeclaredEncoding); |
|
45 DELETE_ARRAY(fByteStats); |
|
46 DELETE_ARRAY(fInputBytes); |
|
47 } |
|
48 |
|
49 void InputText::setText(const char *in, int32_t len) |
|
50 { |
|
51 fInputLen = 0; |
|
52 fC1Bytes = FALSE; |
|
53 fRawInput = (const uint8_t *) in; |
|
54 fRawLength = len == -1? (int32_t)uprv_strlen(in) : len; |
|
55 } |
|
56 |
|
57 void InputText::setDeclaredEncoding(const char* encoding, int32_t len) |
|
58 { |
|
59 if(encoding) { |
|
60 if (len == -1) { |
|
61 len = (int32_t)uprv_strlen(encoding); |
|
62 } |
|
63 |
|
64 len += 1; // to make place for the \0 at the end. |
|
65 uprv_free(fDeclaredEncoding); |
|
66 fDeclaredEncoding = NEW_ARRAY(char, len); |
|
67 uprv_strncpy(fDeclaredEncoding, encoding, len); |
|
68 } |
|
69 } |
|
70 |
|
71 UBool InputText::isSet() const |
|
72 { |
|
73 return fRawInput != NULL; |
|
74 } |
|
75 |
|
76 /** |
|
77 * MungeInput - after getting a set of raw input data to be analyzed, preprocess |
|
78 * it by removing what appears to be html markup. |
|
79 * |
|
80 * @internal |
|
81 */ |
|
82 void InputText::MungeInput(UBool fStripTags) { |
|
83 int srci = 0; |
|
84 int dsti = 0; |
|
85 uint8_t b; |
|
86 bool inMarkup = FALSE; |
|
87 int32_t openTags = 0; |
|
88 int32_t badTags = 0; |
|
89 |
|
90 // |
|
91 // html / xml markup stripping. |
|
92 // quick and dirty, not 100% accurate, but hopefully good enough, statistically. |
|
93 // discard everything within < brackets > |
|
94 // Count how many total '<' and illegal (nested) '<' occur, so we can make some |
|
95 // guess as to whether the input was actually marked up at all. |
|
96 // TODO: Think about how this interacts with EBCDIC charsets that are detected. |
|
97 if (fStripTags) { |
|
98 for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) { |
|
99 b = fRawInput[srci]; |
|
100 |
|
101 if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */ |
|
102 if (inMarkup) { |
|
103 badTags += 1; |
|
104 } |
|
105 |
|
106 inMarkup = TRUE; |
|
107 openTags += 1; |
|
108 } |
|
109 |
|
110 if (! inMarkup) { |
|
111 fInputBytes[dsti++] = b; |
|
112 } |
|
113 |
|
114 if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */ |
|
115 inMarkup = FALSE; |
|
116 } |
|
117 } |
|
118 |
|
119 fInputLen = dsti; |
|
120 } |
|
121 |
|
122 // |
|
123 // If it looks like this input wasn't marked up, or if it looks like it's |
|
124 // essentially nothing but markup abandon the markup stripping. |
|
125 // Detection will have to work on the unstripped input. |
|
126 // |
|
127 if (openTags<5 || openTags/5 < badTags || |
|
128 (fInputLen < 100 && fRawLength>600)) |
|
129 { |
|
130 int32_t limit = fRawLength; |
|
131 |
|
132 if (limit > BUFFER_SIZE) { |
|
133 limit = BUFFER_SIZE; |
|
134 } |
|
135 |
|
136 for (srci=0; srci<limit; srci++) { |
|
137 fInputBytes[srci] = fRawInput[srci]; |
|
138 } |
|
139 |
|
140 fInputLen = srci; |
|
141 } |
|
142 |
|
143 // |
|
144 // Tally up the byte occurence statistics. |
|
145 // These are available for use by the various detectors. |
|
146 // |
|
147 |
|
148 uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256); |
|
149 |
|
150 for (srci = 0; srci < fInputLen; srci += 1) { |
|
151 fByteStats[fInputBytes[srci]] += 1; |
|
152 } |
|
153 |
|
154 for (int32_t i = 0x80; i <= 0x9F; i += 1) { |
|
155 if (fByteStats[i] != 0) { |
|
156 fC1Bytes = TRUE; |
|
157 break; |
|
158 } |
|
159 } |
|
160 } |
|
161 |
|
162 U_NAMESPACE_END |
|
163 #endif |
|
164 |