michael@0: // Copyright 2013 Google Inc. All Rights Reserved. michael@0: // michael@0: // Licensed under the Apache License, Version 2.0 (the "License"); michael@0: // you may not use this file except in compliance with the License. michael@0: // You may obtain a copy of the License at michael@0: // michael@0: // http://www.apache.org/licenses/LICENSE-2.0 michael@0: // michael@0: // Unless required by applicable law or agreed to in writing, software michael@0: // distributed under the License is distributed on an "AS IS" BASIS, michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. michael@0: // See the License for the specific language governing permissions and michael@0: // limitations under the License. michael@0: michael@0: // michael@0: // Author: dsites@google.com (Dick Sites) michael@0: // Updated 2014.01 for dual table lookup michael@0: // michael@0: michael@0: #include "scoreonescriptspan.h" michael@0: michael@0: #include "cldutil.h" michael@0: #include "debug.h" michael@0: #include "lang_script.h" michael@0: michael@0: #include michael@0: michael@0: using namespace std; michael@0: michael@0: namespace CLD2 { michael@0: michael@0: static const int kUnreliablePercentThreshold = 75; michael@0: michael@0: void AddLangProb(uint32 langprob, Tote* chunk_tote) { michael@0: ProcessProbV2Tote(langprob, chunk_tote); michael@0: } michael@0: michael@0: void ZeroPSLang(uint32 langprob, Tote* chunk_tote) { michael@0: uint8 top1 = (langprob >> 8) & 0xff; michael@0: chunk_tote->SetScore(top1, 0); michael@0: } michael@0: michael@0: bool SameCloseSet(uint16 lang1, uint16 lang2) { michael@0: int lang1_close_set = LanguageCloseSet(static_cast(lang1)); michael@0: if (lang1_close_set == 0) {return false;} michael@0: int lang2_close_set = LanguageCloseSet(static_cast(lang2)); michael@0: return (lang1_close_set == lang2_close_set); michael@0: } michael@0: michael@0: bool SameCloseSet(Language lang1, Language lang2) { michael@0: int lang1_close_set = LanguageCloseSet(lang1); michael@0: if (lang1_close_set == 0) {return false;} michael@0: int lang2_close_set = LanguageCloseSet(lang2); michael@0: return (lang1_close_set == lang2_close_set); michael@0: } michael@0: michael@0: michael@0: // Needs expected score per 1KB in scoring context michael@0: void SetChunkSummary(ULScript ulscript, int first_linear_in_chunk, michael@0: int offset, int len, michael@0: const ScoringContext* scoringcontext, michael@0: const Tote* chunk_tote, michael@0: ChunkSummary* chunksummary) { michael@0: int key3[3]; michael@0: chunk_tote->CurrentTopThreeKeys(key3); michael@0: Language lang1 = FromPerScriptNumber(ulscript, key3[0]); michael@0: Language lang2 = FromPerScriptNumber(ulscript, key3[1]); michael@0: michael@0: int actual_score_per_kb = 0; michael@0: if (len > 0) { michael@0: actual_score_per_kb = (chunk_tote->GetScore(key3[0]) << 10) / len; michael@0: } michael@0: int expected_subscr = lang1 * 4 + LScript4(ulscript); michael@0: int expected_score_per_kb = michael@0: scoringcontext->scoringtables->kExpectedScore[expected_subscr]; michael@0: michael@0: chunksummary->offset = offset; michael@0: chunksummary->chunk_start = first_linear_in_chunk; michael@0: chunksummary->lang1 = lang1; michael@0: chunksummary->lang2 = lang2; michael@0: chunksummary->score1 = chunk_tote->GetScore(key3[0]); michael@0: chunksummary->score2 = chunk_tote->GetScore(key3[1]); michael@0: chunksummary->bytes = len; michael@0: chunksummary->grams = chunk_tote->GetScoreCount(); michael@0: chunksummary->ulscript = ulscript; michael@0: chunksummary->reliability_delta = ReliabilityDelta(chunksummary->score1, michael@0: chunksummary->score2, michael@0: chunksummary->grams); michael@0: // If lang1/lang2 in same close set, set delta reliability to 100% michael@0: if (SameCloseSet(lang1, lang2)) { michael@0: chunksummary->reliability_delta = 100; michael@0: } michael@0: chunksummary->reliability_score = michael@0: ReliabilityExpected(actual_score_per_kb, expected_score_per_kb); michael@0: } michael@0: michael@0: // Return true if just lang1 is there: lang2=0 and lang3=0 michael@0: bool IsSingleLang(uint32 langprob) { michael@0: // Probably a bug -- which end is lang1? But only used to call empty Boost1 michael@0: return ((langprob & 0x00ffff00) == 0); michael@0: } michael@0: michael@0: // Update scoring context distinct_boost for single language quad michael@0: void AddDistinctBoost1(uint32 langprob, ScoringContext* scoringcontext) { michael@0: // Probably keep this empty -- not a good enough signal michael@0: } michael@0: michael@0: // Update scoring context distinct_boost for distinct octagram michael@0: // Keep last 4 used. Since these are mostly (except at splices) in michael@0: // hitbuffer, we might be able to just use a subscript and splice michael@0: void AddDistinctBoost2(uint32 langprob, ScoringContext* scoringcontext) { michael@0: // this is called 0..n times per chunk with decoded hitbuffer->distinct... michael@0: LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn; michael@0: if (scoringcontext->ulscript != ULScript_Latin) { michael@0: distinct_boost = &scoringcontext->distinct_boost.othr; michael@0: } michael@0: int n = distinct_boost->n; michael@0: distinct_boost->langprob[n] = langprob; michael@0: distinct_boost->n = distinct_boost->wrap(n + 1); michael@0: } michael@0: michael@0: // For each chunk, add extra weight for language priors (from content-lang and michael@0: // meta lang=xx) and distinctive tokens michael@0: void ScoreBoosts(const ScoringContext* scoringcontext, Tote* chunk_tote) { michael@0: // Get boosts for current script michael@0: const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn; michael@0: const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn; michael@0: const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn; michael@0: if (scoringcontext->ulscript != ULScript_Latin) { michael@0: langprior_boost = &scoringcontext->langprior_boost.othr; michael@0: langprior_whack = &scoringcontext->langprior_whack.othr; michael@0: distinct_boost = &scoringcontext->distinct_boost.othr; michael@0: } michael@0: michael@0: for (int k = 0; k < kMaxBoosts; ++k) { michael@0: uint32 langprob = langprior_boost->langprob[k]; michael@0: if (langprob > 0) {AddLangProb(langprob, chunk_tote);} michael@0: } michael@0: for (int k = 0; k < kMaxBoosts; ++k) { michael@0: uint32 langprob = distinct_boost->langprob[k]; michael@0: if (langprob > 0) {AddLangProb(langprob, chunk_tote);} michael@0: } michael@0: // boost has a packed set of per-script langs and probabilites michael@0: // whack has a packed set of per-script lang to be suppressed (zeroed) michael@0: // When a language in a close set is given as an explicit hint, others in michael@0: // that set will be whacked here. michael@0: for (int k = 0; k < kMaxBoosts; ++k) { michael@0: uint32 langprob = langprior_whack->langprob[k]; michael@0: if (langprob > 0) {ZeroPSLang(langprob, chunk_tote);} michael@0: } michael@0: } michael@0: michael@0: michael@0: michael@0: // At this point, The chunk is described by michael@0: // hitbuffer->base[cspan->chunk_base .. cspan->chunk_base + cspan->base_len) michael@0: // hitbuffer->delta[cspan->chunk_delta ... ) michael@0: // hitbuffer->distinct[cspan->chunk_distinct ... ) michael@0: // Scored text is in text[lo..hi) where michael@0: // lo is 0 or the min of first base/delta/distinct hitbuffer offset and michael@0: // hi is the min of next base/delta/distinct hitbuffer offset after michael@0: // base_len, etc. michael@0: void GetTextSpanOffsets(const ScoringHitBuffer* hitbuffer, michael@0: const ChunkSpan* cspan, int* lo, int* hi) { michael@0: // Front of this span michael@0: int lo_base = hitbuffer->base[cspan->chunk_base].offset; michael@0: int lo_delta = hitbuffer->delta[cspan->chunk_delta].offset; michael@0: int lo_distinct = hitbuffer->distinct[cspan->chunk_distinct].offset; michael@0: // Front of next span michael@0: int hi_base = hitbuffer->base[cspan->chunk_base + michael@0: cspan->base_len].offset; michael@0: int hi_delta = hitbuffer->delta[cspan->chunk_delta + michael@0: cspan->delta_len].offset; michael@0: int hi_distinct = hitbuffer->distinct[cspan->chunk_distinct + michael@0: cspan->distinct_len].offset; michael@0: michael@0: *lo = 0; michael@0: // if (cspan->chunk_base > 0) { michael@0: // *lo = minint(minint(lo_base, lo_delta), lo_distinct); michael@0: // } michael@0: *lo = minint(minint(lo_base, lo_delta), lo_distinct); michael@0: *hi = minint(minint(hi_base, hi_delta), hi_distinct); michael@0: } michael@0: michael@0: michael@0: int DiffScore(const CLD2TableSummary* obj, int indirect, michael@0: uint16 lang1, uint16 lang2) { michael@0: if (indirect < static_cast(obj->kCLDTableSizeOne)) { michael@0: // Up to three languages at indirect michael@0: uint32 langprob = obj->kCLDTableInd[indirect]; michael@0: return GetLangScore(langprob, lang1) - GetLangScore(langprob, lang2); michael@0: } else { michael@0: // Up to six languages at start + 2 * (indirect - start) michael@0: indirect += (indirect - obj->kCLDTableSizeOne); michael@0: uint32 langprob = obj->kCLDTableInd[indirect]; michael@0: uint32 langprob2 = obj->kCLDTableInd[indirect + 1]; michael@0: return (GetLangScore(langprob, lang1) + GetLangScore(langprob2, lang1)) - michael@0: (GetLangScore(langprob, lang2) + GetLangScore(langprob2, lang2)); michael@0: } michael@0: michael@0: } michael@0: michael@0: // Score all the bases, deltas, distincts, boosts for one chunk into chunk_tote michael@0: // After last chunk there is always a hitbuffer entry with an offset just off michael@0: // the end of the text. michael@0: // Sets delta_len, and distinct_len michael@0: void ScoreOneChunk(const char* text, ULScript ulscript, michael@0: const ScoringHitBuffer* hitbuffer, michael@0: int chunk_i, michael@0: ScoringContext* scoringcontext, michael@0: ChunkSpan* cspan, Tote* chunk_tote, michael@0: ChunkSummary* chunksummary) { michael@0: int first_linear_in_chunk = hitbuffer->chunk_start[chunk_i]; michael@0: int first_linear_in_next_chunk = hitbuffer->chunk_start[chunk_i + 1]; michael@0: michael@0: chunk_tote->Reinit(); michael@0: cspan->delta_len = 0; michael@0: cspan->distinct_len = 0; michael@0: if (scoringcontext->flags_cld2_verbose) { michael@0: fprintf(scoringcontext->debug_file, "
ScoreOneChunk[%d..%d) ", michael@0: first_linear_in_chunk, first_linear_in_next_chunk); michael@0: } michael@0: michael@0: // 2013.02.05 linear design: just use base and base_len for the span michael@0: cspan->chunk_base = first_linear_in_chunk; michael@0: cspan->base_len = first_linear_in_next_chunk - first_linear_in_chunk; michael@0: for (int i = first_linear_in_chunk; i < first_linear_in_next_chunk; ++i) { michael@0: uint32 langprob = hitbuffer->linear[i].langprob; michael@0: AddLangProb(langprob, chunk_tote); michael@0: if (hitbuffer->linear[i].type <= QUADHIT) { michael@0: chunk_tote->AddScoreCount(); // Just count quads, not octas michael@0: } michael@0: if (hitbuffer->linear[i].type == DISTINCTHIT) { michael@0: AddDistinctBoost2(langprob, scoringcontext); michael@0: } michael@0: } michael@0: michael@0: // Score language prior boosts michael@0: // Score distinct word boost michael@0: ScoreBoosts(scoringcontext, chunk_tote); michael@0: michael@0: int lo = hitbuffer->linear[first_linear_in_chunk].offset; michael@0: int hi = hitbuffer->linear[first_linear_in_next_chunk].offset; michael@0: michael@0: // Chunk_tote: get top langs, scores, etc. and fill in chunk summary michael@0: SetChunkSummary(ulscript, first_linear_in_chunk, lo, hi - lo, michael@0: scoringcontext, chunk_tote, chunksummary); michael@0: michael@0: bool more_to_come = false; michael@0: bool score_cjk = false; michael@0: if (scoringcontext->flags_cld2_html) { michael@0: // Show one chunk in readable output michael@0: CLD2_Debug(text, lo, hi, more_to_come, score_cjk, hitbuffer, michael@0: scoringcontext, cspan, chunksummary); michael@0: } michael@0: michael@0: scoringcontext->prior_chunk_lang = static_cast(chunksummary->lang1); michael@0: } michael@0: michael@0: michael@0: // Score chunks of text described by hitbuffer, allowing each to be in a michael@0: // different language, and optionally adjusting the boundaries inbetween. michael@0: // Set last_cspan to the last chunkspan used michael@0: void ScoreAllHits(const char* text, ULScript ulscript, michael@0: bool more_to_come, bool score_cjk, michael@0: const ScoringHitBuffer* hitbuffer, michael@0: ScoringContext* scoringcontext, michael@0: SummaryBuffer* summarybuffer, ChunkSpan* last_cspan) { michael@0: ChunkSpan prior_cspan = {0, 0, 0, 0, 0, 0}; michael@0: ChunkSpan cspan = {0, 0, 0, 0, 0, 0}; michael@0: michael@0: for (int i = 0; i < hitbuffer->next_chunk_start; ++i) { michael@0: // Score one chunk michael@0: // Sets delta_len, and distinct_len michael@0: Tote chunk_tote; michael@0: ChunkSummary chunksummary; michael@0: ScoreOneChunk(text, ulscript, michael@0: hitbuffer, i, michael@0: scoringcontext, &cspan, &chunk_tote, &chunksummary); michael@0: michael@0: // Put result in summarybuffer michael@0: if (summarybuffer->n < kMaxSummaries) { michael@0: summarybuffer->chunksummary[summarybuffer->n] = chunksummary; michael@0: summarybuffer->n += 1; michael@0: } michael@0: michael@0: prior_cspan = cspan; michael@0: cspan.chunk_base += cspan.base_len; michael@0: cspan.chunk_delta += cspan.delta_len; michael@0: cspan.chunk_distinct += cspan.distinct_len; michael@0: } michael@0: michael@0: // Add one dummy off the end to hold first unused linear_in_chunk michael@0: int linear_off_end = hitbuffer->next_linear; michael@0: int offset_off_end = hitbuffer->linear[linear_off_end].offset; michael@0: ChunkSummary* cs = &summarybuffer->chunksummary[summarybuffer->n]; michael@0: memset(cs, 0, sizeof(ChunkSummary)); michael@0: cs->offset = offset_off_end; michael@0: cs->chunk_start = linear_off_end; michael@0: *last_cspan = prior_cspan; michael@0: } michael@0: michael@0: michael@0: void SummaryBufferToDocTote(const SummaryBuffer* summarybuffer, michael@0: bool more_to_come, DocTote* doc_tote) { michael@0: int cs_bytes_sum = 0; michael@0: for (int i = 0; i < summarybuffer->n; ++i) { michael@0: const ChunkSummary* cs = &summarybuffer->chunksummary[i]; michael@0: int reliability = minint(cs->reliability_delta, cs->reliability_score); michael@0: // doc_tote uses full languages michael@0: doc_tote->Add(cs->lang1, cs->bytes, cs->score1, reliability); michael@0: cs_bytes_sum += cs->bytes; michael@0: } michael@0: } michael@0: michael@0: // Turn on for debugging vectors michael@0: static const bool kShowLettersOriginal = false; michael@0: michael@0: michael@0: // If next chunk language matches last vector language, extend last element michael@0: // Otherwise add new element to vector michael@0: void ItemToVector(ScriptScanner* scanner, michael@0: ResultChunkVector* vec, Language new_lang, michael@0: int mapped_offset, int mapped_len) { michael@0: uint16 last_vec_lang = static_cast(UNKNOWN_LANGUAGE); michael@0: int last_vec_subscr = vec->size() - 1; michael@0: if (last_vec_subscr >= 0) { michael@0: ResultChunk* priorrc = &(*vec)[last_vec_subscr]; michael@0: last_vec_lang = priorrc->lang1; michael@0: if (new_lang == last_vec_lang) { michael@0: // Extend prior. Current mapped_offset may be beyond prior end, so do michael@0: // the arithmetic to include any such gap michael@0: priorrc->bytes = minint((mapped_offset + mapped_len) - priorrc->offset, michael@0: kMaxResultChunkBytes); michael@0: if (kShowLettersOriginal) { michael@0: // Optionally print the new chunk original text michael@0: string temp2(&scanner->GetBufferStart()[priorrc->offset], michael@0: priorrc->bytes); michael@0: fprintf(stderr, "Item[%d..%d) '%s'
\n", michael@0: priorrc->offset, priorrc->offset + priorrc->bytes, michael@0: GetHtmlEscapedText(temp2).c_str()); michael@0: } michael@0: return; michael@0: } michael@0: } michael@0: // Add new vector element michael@0: ResultChunk rc; michael@0: rc.offset = mapped_offset; michael@0: rc.bytes = minint(mapped_len, kMaxResultChunkBytes); michael@0: rc.lang1 = static_cast(new_lang); michael@0: vec->push_back(rc); michael@0: if (kShowLettersOriginal) { michael@0: // Optionally print the new chunk original text michael@0: string temp2(&scanner->GetBufferStart()[rc.offset], rc.bytes); michael@0: fprintf(stderr, "Item[%d..%d) '%s'
\n", michael@0: rc.offset, rc.offset + rc.bytes, michael@0: GetHtmlEscapedText(temp2).c_str()); michael@0: } michael@0: } michael@0: michael@0: uint16 PriorVecLang(const ResultChunkVector* vec) { michael@0: if (vec->empty()) {return static_cast(UNKNOWN_LANGUAGE);} michael@0: return (*vec)[vec->size() - 1].lang1; michael@0: } michael@0: michael@0: uint16 NextChunkLang(const SummaryBuffer* summarybuffer, int i) { michael@0: if ((i + 1) >= summarybuffer->n) { michael@0: return static_cast(UNKNOWN_LANGUAGE); michael@0: } michael@0: return summarybuffer->chunksummary[i + 1].lang1; michael@0: } michael@0: michael@0: michael@0: michael@0: // Add n elements of summarybuffer to resultchunk vector: michael@0: // Each element is letters-only text [offset..offset+bytes) michael@0: // This maps back to original[Back(offset)..Back(offset+bytes)) michael@0: // michael@0: // We go out of our way to minimize the variation in the ResultChunkVector, michael@0: // so that the caller has fewer but more meaningful spans in different michael@0: // lanaguges, for the likely purpose of translation or spell-check. michael@0: // michael@0: // The language of each chunk is lang1, but it might be unreliable for michael@0: // either of two reasons: its score is relatively too close to the score of michael@0: // lang2, or its score is too far away from the expected score of real text in michael@0: // the given language. Unreliable languages are mapped to Unknown. michael@0: // michael@0: void SummaryBufferToVector(ScriptScanner* scanner, const char* text, michael@0: const SummaryBuffer* summarybuffer, michael@0: bool more_to_come, ResultChunkVector* vec) { michael@0: if (vec == NULL) {return;} michael@0: michael@0: if (kShowLettersOriginal) { michael@0: fprintf(stderr, "map2original_ "); michael@0: scanner->map2original_.DumpWindow(); michael@0: fprintf(stderr, "
\n"); michael@0: fprintf(stderr, "map2uplow_ "); michael@0: scanner->map2uplow_.DumpWindow(); michael@0: fprintf(stderr, "
\n"); michael@0: } michael@0: michael@0: for (int i = 0; i < summarybuffer->n; ++i) { michael@0: const ChunkSummary* cs = &summarybuffer->chunksummary[i]; michael@0: int unmapped_offset = cs->offset; michael@0: int unmapped_len = cs->bytes; michael@0: michael@0: if (kShowLettersOriginal) { michael@0: // Optionally print the chunk lowercase letters/marks text michael@0: string temp(&text[unmapped_offset], unmapped_len); michael@0: fprintf(stderr, "Letters [%d..%d) '%s'
\n", michael@0: unmapped_offset, unmapped_offset + unmapped_len, michael@0: GetHtmlEscapedText(temp).c_str()); michael@0: } michael@0: michael@0: int mapped_offset = scanner->MapBack(unmapped_offset); michael@0: michael@0: // Trim back a little to prefer splicing original at word boundaries michael@0: if (mapped_offset > 0) { michael@0: // Size of prior vector entry, if any michael@0: int prior_size = 0; michael@0: if (!vec->empty()) { michael@0: ResultChunk* rc = &(*vec)[vec->size() - 1]; michael@0: prior_size = rc->bytes; michael@0: } michael@0: // Maximum back up size to leave at least 3 bytes in prior, michael@0: // and not entire buffer, and no more than 12 bytes total backup michael@0: int n_limit = minint(prior_size - 3, mapped_offset); michael@0: n_limit = minint(n_limit, 12); michael@0: michael@0: // Backscan over letters, stopping if prior byte is < 0x41 michael@0: // There is some possibility that we will backscan over a different script michael@0: const char* s = &scanner->GetBufferStart()[mapped_offset]; michael@0: const unsigned char* us = reinterpret_cast(s); michael@0: int n = 0; michael@0: while ((n < n_limit) && (us[-n - 1] >= 0x41)) {++n;} michael@0: if (n >= n_limit) {n = 0;} // New boundary not found within range michael@0: michael@0: // Also back up exactly one leading punctuation character if '"#@ michael@0: if (n < n_limit) { michael@0: unsigned char c = us[-n - 1]; michael@0: if ((c == '\'') || (c == '"') || (c == '#') || (c == '@')) {++n;} michael@0: } michael@0: // Shrink the previous chunk slightly michael@0: if (n > 0) { michael@0: ResultChunk* rc = &(*vec)[vec->size() - 1]; michael@0: rc->bytes -= n; michael@0: mapped_offset -= n; michael@0: if (kShowLettersOriginal) { michael@0: fprintf(stderr, "Back up %d bytes
\n", n); michael@0: // Optionally print the prior chunk original text michael@0: string temp2(&scanner->GetBufferStart()[rc->offset], rc->bytes); michael@0: fprintf(stderr, "Prior [%d..%d) '%s'
\n", michael@0: rc->offset, rc->offset + rc->bytes, michael@0: GetHtmlEscapedText(temp2).c_str()); michael@0: } michael@0: } michael@0: } michael@0: michael@0: int mapped_len = michael@0: scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset; michael@0: michael@0: if (kShowLettersOriginal) { michael@0: // Optionally print the chunk original text michael@0: string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len); michael@0: fprintf(stderr, "Original[%d..%d) '%s'
\n", michael@0: mapped_offset, mapped_offset + mapped_len, michael@0: GetHtmlEscapedText(temp2).c_str()); michael@0: } michael@0: michael@0: Language new_lang = static_cast(cs->lang1); michael@0: bool reliability_delta_bad = michael@0: (cs->reliability_delta < kUnreliablePercentThreshold); michael@0: bool reliability_score_bad = michael@0: (cs->reliability_score < kUnreliablePercentThreshold); michael@0: michael@0: // If the top language matches last vector, ignore reliability_delta michael@0: uint16 prior_lang = PriorVecLang(vec); michael@0: if (prior_lang == cs->lang1) { michael@0: reliability_delta_bad = false; michael@0: } michael@0: // If the top language is in same close set as last vector, set up to merge michael@0: if (SameCloseSet(cs->lang1, prior_lang)) { michael@0: new_lang = static_cast(prior_lang); michael@0: reliability_delta_bad = false; michael@0: } michael@0: // If the top two languages are in the same close set and the last vector michael@0: // language is the second language, set up to merge michael@0: if (SameCloseSet(cs->lang1, cs->lang2) && michael@0: (prior_lang == cs->lang2)) { michael@0: new_lang = static_cast(prior_lang); michael@0: reliability_delta_bad = false; michael@0: } michael@0: // If unreliable and the last and next vector languages are both michael@0: // the second language, set up to merge michael@0: uint16 next_lang = NextChunkLang(summarybuffer, i); michael@0: if (reliability_delta_bad && michael@0: (prior_lang == cs->lang2) && (next_lang == cs->lang2)) { michael@0: new_lang = static_cast(prior_lang); michael@0: reliability_delta_bad = false; michael@0: } michael@0: michael@0: if (reliability_delta_bad || reliability_score_bad) { michael@0: new_lang = UNKNOWN_LANGUAGE; michael@0: } michael@0: ItemToVector(scanner, vec, new_lang, mapped_offset, mapped_len); michael@0: } michael@0: } michael@0: michael@0: // Add just one element to resultchunk vector: michael@0: // For RTypeNone or RTypeOne michael@0: void JustOneItemToVector(ScriptScanner* scanner, const char* text, michael@0: Language lang1, int unmapped_offset, int unmapped_len, michael@0: ResultChunkVector* vec) { michael@0: if (vec == NULL) {return;} michael@0: michael@0: if (kShowLettersOriginal) { michael@0: fprintf(stderr, "map2original_ "); michael@0: scanner->map2original_.DumpWindow(); michael@0: fprintf(stderr, "
\n"); michael@0: fprintf(stderr, "map2uplow_ "); michael@0: scanner->map2uplow_.DumpWindow(); michael@0: fprintf(stderr, "
\n"); michael@0: } michael@0: michael@0: if (kShowLettersOriginal) { michael@0: // Optionally print the chunk lowercase letters/marks text michael@0: string temp(&text[unmapped_offset], unmapped_len); michael@0: fprintf(stderr, "Letters1 [%d..%d) '%s'
\n", michael@0: unmapped_offset, unmapped_offset + unmapped_len, michael@0: GetHtmlEscapedText(temp).c_str()); michael@0: } michael@0: michael@0: int mapped_offset = scanner->MapBack(unmapped_offset); michael@0: int mapped_len = michael@0: scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset; michael@0: michael@0: if (kShowLettersOriginal) { michael@0: // Optionally print the chunk original text michael@0: string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len); michael@0: fprintf(stderr, "Original1[%d..%d) '%s'
\n", michael@0: mapped_offset, mapped_offset + mapped_len, michael@0: GetHtmlEscapedText(temp2).c_str()); michael@0: } michael@0: michael@0: ItemToVector(scanner, vec, lang1, mapped_offset, mapped_len); michael@0: } michael@0: michael@0: michael@0: // Debugging. Not thread safe. Defined in getonescriptspan michael@0: char* DisplayPiece(const char* next_byte_, int byte_length_); michael@0: michael@0: // If high bit is on, take out high bit and add 2B to make table2 entries easy michael@0: inline int PrintableIndirect(int x) { michael@0: if ((x & 0x80000000u) != 0) { michael@0: return (x & ~0x80000000u) + 2000000000; michael@0: } michael@0: return x; michael@0: } michael@0: void DumpHitBuffer(FILE* df, const char* text, michael@0: const ScoringHitBuffer* hitbuffer) { michael@0: fprintf(df, michael@0: "
DumpHitBuffer[%s, next_base/delta/distinct %d, %d, %d)
\n", michael@0: ULScriptCode(hitbuffer->ulscript), michael@0: hitbuffer->next_base, hitbuffer->next_delta, michael@0: hitbuffer->next_distinct); michael@0: for (int i = 0; i < hitbuffer->maxscoringhits; ++i) { michael@0: if (i < hitbuffer->next_base) { michael@0: fprintf(df, "Q[%d]%d,%d,%s ", michael@0: i, hitbuffer->base[i].offset, michael@0: PrintableIndirect(hitbuffer->base[i].indirect), michael@0: DisplayPiece(&text[hitbuffer->base[i].offset], 6)); michael@0: } michael@0: if (i < hitbuffer->next_delta) { michael@0: fprintf(df, "DL[%d]%d,%d,%s ", michael@0: i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect, michael@0: DisplayPiece(&text[hitbuffer->delta[i].offset], 12)); michael@0: } michael@0: if (i < hitbuffer->next_distinct) { michael@0: fprintf(df, "D[%d]%d,%d,%s ", michael@0: i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect, michael@0: DisplayPiece(&text[hitbuffer->distinct[i].offset], 12)); michael@0: } michael@0: if (i < hitbuffer->next_base) { michael@0: fprintf(df, "
\n"); michael@0: } michael@0: if (i > 50) {break;} michael@0: } michael@0: if (hitbuffer->next_base > 50) { michael@0: int i = hitbuffer->next_base; michael@0: fprintf(df, "Q[%d]%d,%d,%s ", michael@0: i, hitbuffer->base[i].offset, michael@0: PrintableIndirect(hitbuffer->base[i].indirect), michael@0: DisplayPiece(&text[hitbuffer->base[i].offset], 6)); michael@0: } michael@0: if (hitbuffer->next_delta > 50) { michael@0: int i = hitbuffer->next_delta; michael@0: fprintf(df, "DL[%d]%d,%d,%s ", michael@0: i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect, michael@0: DisplayPiece(&text[hitbuffer->delta[i].offset], 12)); michael@0: } michael@0: if (hitbuffer->next_distinct > 50) { michael@0: int i = hitbuffer->next_distinct; michael@0: fprintf(df, "D[%d]%d,%d,%s ", michael@0: i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect, michael@0: DisplayPiece(&text[hitbuffer->distinct[i].offset], 12)); michael@0: } michael@0: fprintf(df, "
\n"); michael@0: } michael@0: michael@0: michael@0: void DumpLinearBuffer(FILE* df, const char* text, michael@0: const ScoringHitBuffer* hitbuffer) { michael@0: fprintf(df, "
DumpLinearBuffer[%d)
\n", michael@0: hitbuffer->next_linear); michael@0: // Include the dummy entry off the end michael@0: for (int i = 0; i < hitbuffer->next_linear + 1; ++i) { michael@0: if ((50 < i) && (i < (hitbuffer->next_linear - 1))) {continue;} michael@0: fprintf(df, "[%d]%d,%c=%08x,%s
\n", michael@0: i, hitbuffer->linear[i].offset, michael@0: "UQLD"[hitbuffer->linear[i].type], michael@0: hitbuffer->linear[i].langprob, michael@0: DisplayPiece(&text[hitbuffer->linear[i].offset], 6)); michael@0: } michael@0: fprintf(df, "
\n"); michael@0: michael@0: fprintf(df, "DumpChunkStart[%d]
\n", hitbuffer->next_chunk_start); michael@0: for (int i = 0; i < hitbuffer->next_chunk_start + 1; ++i) { michael@0: fprintf(df, "[%d]%d\n", i, hitbuffer->chunk_start[i]); michael@0: } michael@0: fprintf(df, "
\n"); michael@0: } michael@0: michael@0: // Move this verbose debugging output to debug.cc eventually michael@0: void DumpChunkSummary(FILE* df, const ChunkSummary* cs) { michael@0: // Print chunksummary michael@0: fprintf(df, "%d lin[%d] %s.%d %s.%d %dB %d# %s %dRd %dRs
\n", michael@0: cs->offset, michael@0: cs->chunk_start, michael@0: LanguageCode(static_cast(cs->lang1)), michael@0: cs->score1, michael@0: LanguageCode(static_cast(cs->lang2)), michael@0: cs->score2, michael@0: cs->bytes, michael@0: cs->grams, michael@0: ULScriptCode(static_cast(cs->ulscript)), michael@0: cs->reliability_delta, michael@0: cs->reliability_score); michael@0: } michael@0: michael@0: void DumpSummaryBuffer(FILE* df, const SummaryBuffer* summarybuffer) { michael@0: fprintf(df, "
DumpSummaryBuffer[%d]
\n", summarybuffer->n); michael@0: fprintf(df, "[i] offset linear[chunk_start] lang.score1 lang.score2 " michael@0: "bytesB ngrams# script rel_delta rel_score
\n"); michael@0: for (int i = 0; i <= summarybuffer->n; ++i) { michael@0: fprintf(df, "[%d] ", i); michael@0: DumpChunkSummary(df, &summarybuffer->chunksummary[i]); michael@0: } michael@0: fprintf(df, "
\n"); michael@0: } michael@0: michael@0: michael@0: michael@0: // Within hitbufer->linear[] michael@0: // <-- prior chunk --><-- this chunk --> michael@0: // | | | michael@0: // linear0 linear1 linear2 michael@0: // lang0 lang1 michael@0: // The goal of sharpening is to move this_linear to better separate langs michael@0: int BetterBoundary(const char* text, michael@0: ScoringHitBuffer* hitbuffer, michael@0: ScoringContext* scoringcontext, michael@0: uint16 pslang0, uint16 pslang1, michael@0: int linear0, int linear1, int linear2) { michael@0: // Degenerate case, no change michael@0: if ((linear2 - linear0) <= 8) {return linear1;} michael@0: michael@0: // Each diff gives pslang0 score - pslang1 score michael@0: // Running diff has four entries + + + + followed by four entries - - - - michael@0: // so that this value is maximal at the sharpest boundary between pslang0 michael@0: // (positive diffs) and pslang1 (negative diffs) michael@0: int running_diff = 0; michael@0: int diff[8]; // Ring buffer of pslang0-pslang1 differences michael@0: // Initialize with first 8 diffs michael@0: for (int i = linear0; i < linear0 + 8; ++i) { michael@0: int j = i & 7; michael@0: uint32 langprob = hitbuffer->linear[i].langprob; michael@0: diff[j] = GetLangScore(langprob, pslang0) - michael@0: GetLangScore(langprob, pslang1); michael@0: if (i < linear0 + 4) { michael@0: // First four diffs pslang0 - pslang1 michael@0: running_diff += diff[j]; michael@0: } else { michael@0: // Second four diffs -(pslang0 - pslang1) michael@0: running_diff -= diff[j]; michael@0: } michael@0: } michael@0: michael@0: // Now scan for sharpest boundary. j is at left end of 8 entries michael@0: // To be a boundary, there must be both >0 and <0 entries in the window michael@0: int better_boundary_value = 0; michael@0: int better_boundary = linear1; michael@0: for (int i = linear0; i < linear2 - 8; ++i) { michael@0: int j = i & 7; michael@0: if (better_boundary_value < running_diff) { michael@0: bool has_plus = false; michael@0: bool has_minus = false; michael@0: for (int kk = 0; kk < 8; ++kk) { michael@0: if (diff[kk] > 0) {has_plus = true;} michael@0: if (diff[kk] < 0) {has_minus = true;} michael@0: } michael@0: if (has_plus && has_minus) { michael@0: better_boundary_value = running_diff; michael@0: better_boundary = i + 4; michael@0: } michael@0: } michael@0: // Shift right one entry michael@0: uint32 langprob = hitbuffer->linear[i + 8].langprob; michael@0: int newdiff = GetLangScore(langprob, pslang0) - michael@0: GetLangScore(langprob, pslang1); michael@0: int middiff = diff[(i + 4) & 7]; michael@0: int olddiff = diff[j]; michael@0: diff[j] = newdiff; michael@0: running_diff -= olddiff; // Remove left michael@0: running_diff += 2 * middiff; // Convert middle from - to + michael@0: running_diff -= newdiff; // Insert right michael@0: } michael@0: michael@0: if (scoringcontext->flags_cld2_verbose && (linear1 != better_boundary)) { michael@0: Language lang0 = FromPerScriptNumber(scoringcontext->ulscript, pslang0); michael@0: Language lang1 = FromPerScriptNumber(scoringcontext->ulscript, pslang1); michael@0: fprintf(scoringcontext->debug_file, " Better lin[%d=>%d] %s^^%s
\n", michael@0: linear1, better_boundary, michael@0: LanguageCode(lang0), LanguageCode(lang1)); michael@0: int lin0_off = hitbuffer->linear[linear0].offset; michael@0: int lin1_off = hitbuffer->linear[linear1].offset; michael@0: int lin2_off = hitbuffer->linear[linear2].offset; michael@0: int better_offm1 = hitbuffer->linear[better_boundary - 1].offset; michael@0: int better_off = hitbuffer->linear[better_boundary].offset; michael@0: int better_offp1 = hitbuffer->linear[better_boundary + 1].offset; michael@0: string old0(&text[lin0_off], lin1_off - lin0_off); michael@0: string old1(&text[lin1_off], lin2_off - lin1_off); michael@0: string new0(&text[lin0_off], better_offm1 - lin0_off); michael@0: string new0m1(&text[better_offm1], better_off - better_offm1); michael@0: string new1(&text[better_off], better_offp1 - better_off); michael@0: string new1p1(&text[better_offp1], lin2_off - better_offp1); michael@0: fprintf(scoringcontext->debug_file, "%s^^%s =>
\n%s^%s^^%s^%s
\n", michael@0: GetHtmlEscapedText(old0).c_str(), michael@0: GetHtmlEscapedText(old1).c_str(), michael@0: GetHtmlEscapedText(new0).c_str(), michael@0: GetHtmlEscapedText(new0m1).c_str(), michael@0: GetHtmlEscapedText(new1).c_str(), michael@0: GetHtmlEscapedText(new1p1).c_str()); michael@0: // Slow picture of differences per linear entry michael@0: int d; michael@0: for (int i = linear0; i < linear2; ++i) { michael@0: if (i == better_boundary) { michael@0: fprintf(scoringcontext->debug_file, "^^ "); michael@0: } michael@0: uint32 langprob = hitbuffer->linear[i].langprob; michael@0: d = GetLangScore(langprob, pslang0) - GetLangScore(langprob, pslang1); michael@0: const char* s = "="; michael@0: //if (d > 2) {s = "\xc2\xaf";} // Macron michael@0: if (d > 2) {s = "#";} michael@0: else if (d > 0) {s = "+";} michael@0: else if (d < -2) {s = "_";} michael@0: else if (d < 0) {s = "-";} michael@0: fprintf(scoringcontext->debug_file, "%s ", s); michael@0: } michael@0: fprintf(scoringcontext->debug_file, "   (scale: #+=-_)
\n"); michael@0: } michael@0: return better_boundary; michael@0: } michael@0: michael@0: michael@0: // For all but the first summary, if its top language differs from michael@0: // the previous chunk, refine the boundary michael@0: // Linearized version michael@0: void SharpenBoundaries(const char* text, michael@0: bool more_to_come, michael@0: ScoringHitBuffer* hitbuffer, michael@0: ScoringContext* scoringcontext, michael@0: SummaryBuffer* summarybuffer) { michael@0: michael@0: int prior_linear = summarybuffer->chunksummary[0].chunk_start; michael@0: uint16 prior_lang = summarybuffer->chunksummary[0].lang1; michael@0: michael@0: if (scoringcontext->flags_cld2_verbose) { michael@0: fprintf(scoringcontext->debug_file, "
SharpenBoundaries
\n"); michael@0: } michael@0: for (int i = 1; i < summarybuffer->n; ++i) { michael@0: ChunkSummary* cs = &summarybuffer->chunksummary[i]; michael@0: uint16 this_lang = cs->lang1; michael@0: if (this_lang == prior_lang) { michael@0: prior_linear = cs->chunk_start; michael@0: continue; michael@0: } michael@0: michael@0: int this_linear = cs->chunk_start; michael@0: int next_linear = summarybuffer->chunksummary[i + 1].chunk_start; michael@0: michael@0: // If this/prior in same close set, don't move boundary michael@0: if (SameCloseSet(prior_lang, this_lang)) { michael@0: prior_linear = this_linear; michael@0: prior_lang = this_lang; michael@0: continue; michael@0: } michael@0: michael@0: michael@0: // Within hitbuffer->linear[] michael@0: // <-- prior chunk --><-- this chunk --> michael@0: // | | | michael@0: // prior_linear this_linear next_linear michael@0: // prior_lang this_lang michael@0: // The goal of sharpening is to move this_linear to better separate langs michael@0: michael@0: uint8 pslang0 = PerScriptNumber(scoringcontext->ulscript, michael@0: static_cast(prior_lang)); michael@0: uint8 pslang1 = PerScriptNumber(scoringcontext->ulscript, michael@0: static_cast(this_lang)); michael@0: int better_linear = BetterBoundary(text, michael@0: hitbuffer, michael@0: scoringcontext, michael@0: pslang0, pslang1, michael@0: prior_linear, this_linear, next_linear); michael@0: michael@0: int old_offset = hitbuffer->linear[this_linear].offset; michael@0: int new_offset = hitbuffer->linear[better_linear].offset; michael@0: cs->chunk_start = better_linear; michael@0: cs->offset = new_offset; michael@0: // If this_linear moved right, make bytes smaller for this, larger for prior michael@0: // If this_linear moved left, make bytes larger for this, smaller for prior michael@0: cs->bytes -= (new_offset - old_offset); michael@0: summarybuffer->chunksummary[i - 1].bytes += (new_offset - old_offset); michael@0: michael@0: this_linear = better_linear; // Update so that next chunk doesn't intrude michael@0: michael@0: // Consider rescoring the two chunks michael@0: michael@0: // Update for next round (note: using pre-updated boundary) michael@0: prior_linear = this_linear; michael@0: prior_lang = this_lang; michael@0: } michael@0: } michael@0: michael@0: // Make a langprob that gives small weight to the default language for ulscript michael@0: uint32 DefaultLangProb(ULScript ulscript) { michael@0: Language default_lang = DefaultLanguage(ulscript); michael@0: return MakeLangProb(default_lang, 1); michael@0: } michael@0: michael@0: // Effectively, do a merge-sort based on text offsets michael@0: // Look up each indirect value in appropriate scoring table and keep michael@0: // just the resulting langprobs michael@0: void LinearizeAll(ScoringContext* scoringcontext, bool score_cjk, michael@0: ScoringHitBuffer* hitbuffer) { michael@0: const CLD2TableSummary* base_obj; // unigram or quadgram michael@0: const CLD2TableSummary* base_obj2; // quadgram dual table michael@0: const CLD2TableSummary* delta_obj; // bigram or octagram michael@0: const CLD2TableSummary* distinct_obj; // bigram or octagram michael@0: uint16 base_hit; michael@0: if (score_cjk) { michael@0: base_obj = scoringcontext->scoringtables->unigram_compat_obj; michael@0: base_obj2 = scoringcontext->scoringtables->unigram_compat_obj; michael@0: delta_obj = scoringcontext->scoringtables->deltabi_obj; michael@0: distinct_obj = scoringcontext->scoringtables->distinctbi_obj; michael@0: base_hit = UNIHIT; michael@0: } else { michael@0: base_obj = scoringcontext->scoringtables->quadgram_obj; michael@0: base_obj2 = scoringcontext->scoringtables->quadgram_obj2; michael@0: delta_obj = scoringcontext->scoringtables->deltaocta_obj; michael@0: distinct_obj = scoringcontext->scoringtables->distinctocta_obj; michael@0: base_hit = QUADHIT; michael@0: } michael@0: michael@0: int base_limit = hitbuffer->next_base; michael@0: int delta_limit = hitbuffer->next_delta; michael@0: int distinct_limit = hitbuffer->next_distinct; michael@0: int base_i = 0; michael@0: int delta_i = 0; michael@0: int distinct_i = 0; michael@0: int linear_i = 0; michael@0: michael@0: // Start with an initial base hit for the default language for this script michael@0: // Inserting this avoids edge effects with no hits at all michael@0: hitbuffer->linear[linear_i].offset = hitbuffer->lowest_offset; michael@0: hitbuffer->linear[linear_i].type = base_hit; michael@0: hitbuffer->linear[linear_i].langprob = michael@0: DefaultLangProb(scoringcontext->ulscript); michael@0: ++linear_i; michael@0: michael@0: while ((base_i < base_limit) || (delta_i < delta_limit) || michael@0: (distinct_i < distinct_limit)) { michael@0: int base_off = hitbuffer->base[base_i].offset; michael@0: int delta_off = hitbuffer->delta[delta_i].offset; michael@0: int distinct_off = hitbuffer->distinct[distinct_i].offset; michael@0: michael@0: // Do delta and distinct first, so that they are not lost at base_limit michael@0: if ((delta_i < delta_limit) && michael@0: (delta_off <= base_off) && (delta_off <= distinct_off)) { michael@0: // Add delta entry michael@0: int indirect = hitbuffer->delta[delta_i].indirect; michael@0: ++delta_i; michael@0: uint32 langprob = delta_obj->kCLDTableInd[indirect]; michael@0: if (langprob > 0) { michael@0: hitbuffer->linear[linear_i].offset = delta_off; michael@0: hitbuffer->linear[linear_i].type = DELTAHIT; michael@0: hitbuffer->linear[linear_i].langprob = langprob; michael@0: ++linear_i; michael@0: } michael@0: } michael@0: else if ((distinct_i < distinct_limit) && michael@0: (distinct_off <= base_off) && (distinct_off <= delta_off)) { michael@0: // Add distinct entry michael@0: int indirect = hitbuffer->distinct[distinct_i].indirect; michael@0: ++distinct_i; michael@0: uint32 langprob = distinct_obj->kCLDTableInd[indirect]; michael@0: if (langprob > 0) { michael@0: hitbuffer->linear[linear_i].offset = distinct_off; michael@0: hitbuffer->linear[linear_i].type = DISTINCTHIT; michael@0: hitbuffer->linear[linear_i].langprob = langprob; michael@0: ++linear_i; michael@0: } michael@0: } michael@0: else { michael@0: // Add one or two base entries michael@0: int indirect = hitbuffer->base[base_i].indirect; michael@0: // First, get right scoring table michael@0: const CLD2TableSummary* local_base_obj = base_obj; michael@0: if ((indirect & 0x80000000u) != 0) { michael@0: local_base_obj = base_obj2; michael@0: indirect &= ~0x80000000u; michael@0: } michael@0: ++base_i; michael@0: // One langprob in kQuadInd[0..SingleSize), michael@0: // two in kQuadInd[SingleSize..Size) michael@0: if (indirect < static_cast(local_base_obj->kCLDTableSizeOne)) { michael@0: // Up to three languages at indirect michael@0: uint32 langprob = local_base_obj->kCLDTableInd[indirect]; michael@0: if (langprob > 0) { michael@0: hitbuffer->linear[linear_i].offset = base_off; michael@0: hitbuffer->linear[linear_i].type = base_hit; michael@0: hitbuffer->linear[linear_i].langprob = langprob; michael@0: ++linear_i; michael@0: } michael@0: } else { michael@0: // Up to six languages at start + 2 * (indirect - start) michael@0: indirect += (indirect - local_base_obj->kCLDTableSizeOne); michael@0: uint32 langprob = local_base_obj->kCLDTableInd[indirect]; michael@0: uint32 langprob2 = local_base_obj->kCLDTableInd[indirect + 1]; michael@0: if (langprob > 0) { michael@0: hitbuffer->linear[linear_i].offset = base_off; michael@0: hitbuffer->linear[linear_i].type = base_hit; michael@0: hitbuffer->linear[linear_i].langprob = langprob; michael@0: ++linear_i; michael@0: } michael@0: if (langprob2 > 0) { michael@0: hitbuffer->linear[linear_i].offset = base_off; michael@0: hitbuffer->linear[linear_i].type = base_hit; michael@0: hitbuffer->linear[linear_i].langprob = langprob2; michael@0: ++linear_i; michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: // Update michael@0: hitbuffer->next_linear = linear_i; michael@0: michael@0: // Add a dummy entry off the end, just to capture final offset michael@0: hitbuffer->linear[linear_i].offset = michael@0: hitbuffer->base[hitbuffer->next_base].offset; michael@0: hitbuffer->linear[linear_i].langprob = 0; michael@0: } michael@0: michael@0: // Break linear array into chunks of ~20 quadgram hits or ~50 CJK unigram hits michael@0: void ChunkAll(int letter_offset, bool score_cjk, ScoringHitBuffer* hitbuffer) { michael@0: int chunksize; michael@0: uint16 base_hit; michael@0: if (score_cjk) { michael@0: chunksize = kChunksizeUnis; michael@0: base_hit = UNIHIT; michael@0: } else { michael@0: chunksize = kChunksizeQuads; michael@0: base_hit = QUADHIT; michael@0: } michael@0: michael@0: int linear_i = 0; michael@0: int linear_off_end = hitbuffer->next_linear; michael@0: int text_i = letter_offset; // Next unseen text offset michael@0: int next_chunk_start = 0; michael@0: int bases_left = hitbuffer->next_base; michael@0: while (bases_left > 0) { michael@0: // Linearize one chunk michael@0: int base_len = chunksize; // Default; may be changed below michael@0: if (bases_left < (chunksize + (chunksize >> 1))) { michael@0: // If within 1.5 chunks of the end, avoid runts by using it all michael@0: base_len = bases_left; michael@0: } else if (bases_left < (2 * chunksize)) { michael@0: // Avoid runts by splitting 1.5 to 2 chunks in half (about 3/4 each) michael@0: base_len = (bases_left + 1) >> 1; michael@0: } michael@0: michael@0: hitbuffer->chunk_start[next_chunk_start] = linear_i; michael@0: hitbuffer->chunk_offset[next_chunk_start] = text_i; michael@0: ++next_chunk_start; michael@0: michael@0: int base_count = 0; michael@0: while ((base_count < base_len) && (linear_i < linear_off_end)) { michael@0: if (hitbuffer->linear[linear_i].type == base_hit) {++base_count;} michael@0: ++linear_i; michael@0: } michael@0: text_i = hitbuffer->linear[linear_i].offset; // Next unseen text offset michael@0: bases_left -= base_len; michael@0: } michael@0: michael@0: // If no base hits at all, make a single dummy chunk michael@0: if (next_chunk_start == 0) { michael@0: hitbuffer->chunk_start[next_chunk_start] = 0; michael@0: hitbuffer->chunk_offset[next_chunk_start] = hitbuffer->linear[0].offset; michael@0: ++next_chunk_start; michael@0: } michael@0: michael@0: // Remember the linear array start of dummy entry michael@0: hitbuffer->next_chunk_start = next_chunk_start; michael@0: michael@0: // Add a dummy entry off the end, just to capture final linear subscr michael@0: hitbuffer->chunk_start[next_chunk_start] = hitbuffer->next_linear; michael@0: hitbuffer->chunk_offset[next_chunk_start] = text_i; michael@0: } michael@0: michael@0: michael@0: // Merge-sort the individual hit arrays, go indirect on the scoring subscripts, michael@0: // break linear array into chunks. michael@0: // michael@0: // Input: michael@0: // hitbuffer base, delta, distinct arrays michael@0: // Output: michael@0: // linear array michael@0: // chunk_start array michael@0: // michael@0: void LinearizeHitBuffer(int letter_offset, michael@0: ScoringContext* scoringcontext, michael@0: bool more_to_come, bool score_cjk, michael@0: ScoringHitBuffer* hitbuffer) { michael@0: LinearizeAll(scoringcontext, score_cjk, hitbuffer); michael@0: ChunkAll(letter_offset, score_cjk, hitbuffer); michael@0: } michael@0: michael@0: michael@0: michael@0: // The hitbuffer is in an awkward form -- three sets of base/delta/distinct michael@0: // scores, each with an indirect subscript to one of six scoring tables, some michael@0: // of which can yield two langprobs for six languages, others one langprob for michael@0: // three languages. The only correlation between base/delta/distinct is their michael@0: // offsets into the letters-only text buffer. michael@0: // michael@0: // SummaryBuffer needs to be built to linear, giving linear offset of start of michael@0: // each chunk michael@0: // michael@0: // So we first do all the langprob lookups and merge-sort by offset to make michael@0: // a single linear vector, building a side vector of chunk beginnings as we go. michael@0: // The sharpening is simply moving the beginnings, scoring is a simple linear michael@0: // sweep, etc. michael@0: michael@0: void ProcessHitBuffer(const LangSpan& scriptspan, michael@0: int letter_offset, michael@0: ScoringContext* scoringcontext, michael@0: DocTote* doc_tote, michael@0: ResultChunkVector* vec, michael@0: bool more_to_come, bool score_cjk, michael@0: ScoringHitBuffer* hitbuffer) { michael@0: if (scoringcontext->flags_cld2_verbose) { michael@0: fprintf(scoringcontext->debug_file, "Hitbuffer[) "); michael@0: DumpHitBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer); michael@0: } michael@0: michael@0: LinearizeHitBuffer(letter_offset, scoringcontext, more_to_come, score_cjk, michael@0: hitbuffer); michael@0: michael@0: if (scoringcontext->flags_cld2_verbose) { michael@0: fprintf(scoringcontext->debug_file, "Linear[) "); michael@0: DumpLinearBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer); michael@0: } michael@0: michael@0: SummaryBuffer summarybuffer; michael@0: summarybuffer.n = 0; michael@0: ChunkSpan last_cspan; michael@0: ScoreAllHits(scriptspan.text, scriptspan.ulscript, michael@0: more_to_come, score_cjk, hitbuffer, michael@0: scoringcontext, michael@0: &summarybuffer, &last_cspan); michael@0: michael@0: if (scoringcontext->flags_cld2_verbose) { michael@0: DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer); michael@0: } michael@0: michael@0: if (vec != NULL) { michael@0: // Sharpen boundaries of summarybuffer michael@0: // This is not a high-performance path michael@0: SharpenBoundaries(scriptspan.text, more_to_come, hitbuffer, scoringcontext, michael@0: &summarybuffer); michael@0: // Show after the sharpening michael@0: // CLD2_Debug2(scriptspan.text, more_to_come, score_cjk, michael@0: // hitbuffer, scoringcontext, &summarybuffer); michael@0: michael@0: if (scoringcontext->flags_cld2_verbose) { michael@0: DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer); michael@0: } michael@0: } michael@0: michael@0: SummaryBufferToDocTote(&summarybuffer, more_to_come, doc_tote); michael@0: SummaryBufferToVector(scoringcontext->scanner, scriptspan.text, michael@0: &summarybuffer, more_to_come, vec); michael@0: } michael@0: michael@0: void SpliceHitBuffer(ScoringHitBuffer* hitbuffer, int next_offset) { michael@0: // Splice hitbuffer and summarybuffer for next round. With big chunks and michael@0: // distinctive-word state carried across chunks, we might not need to do this. michael@0: hitbuffer->next_base = 0; michael@0: hitbuffer->next_delta = 0; michael@0: hitbuffer->next_distinct = 0; michael@0: hitbuffer->next_linear = 0; michael@0: hitbuffer->next_chunk_start = 0; michael@0: hitbuffer->lowest_offset = next_offset; michael@0: } michael@0: michael@0: michael@0: // Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating michael@0: // scoringcontext michael@0: void ScoreEntireScriptSpan(const LangSpan& scriptspan, michael@0: ScoringContext* scoringcontext, michael@0: DocTote* doc_tote, michael@0: ResultChunkVector* vec) { michael@0: int bytes = scriptspan.text_bytes; michael@0: // Artificially set score to 1024 per 1KB, or 1 per byte michael@0: int score = bytes; michael@0: int reliability = 100; michael@0: // doc_tote uses full languages michael@0: Language one_one_lang = DefaultLanguage(scriptspan.ulscript); michael@0: doc_tote->Add(one_one_lang, bytes, score, reliability); michael@0: michael@0: if (scoringcontext->flags_cld2_html) { michael@0: ChunkSummary chunksummary = { michael@0: 1, 0, michael@0: one_one_lang, UNKNOWN_LANGUAGE, score, 1, michael@0: bytes, 0, scriptspan.ulscript, reliability, reliability michael@0: }; michael@0: CLD2_Debug(scriptspan.text, 1, scriptspan.text_bytes, michael@0: false, false, NULL, michael@0: scoringcontext, NULL, &chunksummary); michael@0: } michael@0: michael@0: // First byte is always a space michael@0: JustOneItemToVector(scoringcontext->scanner, scriptspan.text, michael@0: one_one_lang, 1, bytes - 1, vec); michael@0: michael@0: scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE; michael@0: } michael@0: michael@0: // Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext michael@0: void ScoreCJKScriptSpan(const LangSpan& scriptspan, michael@0: ScoringContext* scoringcontext, michael@0: DocTote* doc_tote, michael@0: ResultChunkVector* vec) { michael@0: // Allocate three parallel arrays of scoring hits michael@0: ScoringHitBuffer* hitbuffer = new ScoringHitBuffer; michael@0: hitbuffer->init(); michael@0: hitbuffer->ulscript = scriptspan.ulscript; michael@0: michael@0: scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE; michael@0: scoringcontext->oldest_distinct_boost = 0; michael@0: michael@0: // Incoming scriptspan has a single leading space at scriptspan.text[0] michael@0: // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3] michael@0: michael@0: int letter_offset = 1; // Skip initial space michael@0: hitbuffer->lowest_offset = letter_offset; michael@0: int letter_limit = scriptspan.text_bytes; michael@0: while (letter_offset < letter_limit) { michael@0: if (scoringcontext->flags_cld2_verbose) { michael@0: fprintf(scoringcontext->debug_file, " ScoreCJKScriptSpan[%d,%d)
\n", michael@0: letter_offset, letter_limit); michael@0: } michael@0: // michael@0: // Fill up one hitbuffer, possibly splicing onto previous fragment michael@0: // michael@0: // NOTE: GetUniHits deals with close repeats michael@0: // NOTE: After last chunk there is always a hitbuffer entry with an offset michael@0: // just off the end of the text = next_offset. michael@0: int next_offset = GetUniHits(scriptspan.text, letter_offset, letter_limit, michael@0: scoringcontext, hitbuffer); michael@0: // NOTE: GetBiHitVectors deals with close repeats, michael@0: // does one hash and two lookups (delta and distinct) per word michael@0: GetBiHits(scriptspan.text, letter_offset, next_offset, michael@0: scoringcontext, hitbuffer); michael@0: michael@0: // michael@0: // Score one hitbuffer in chunks to summarybuffer michael@0: // michael@0: bool more_to_come = next_offset < letter_limit; michael@0: bool score_cjk = true; michael@0: ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec, michael@0: more_to_come, score_cjk, hitbuffer); michael@0: SpliceHitBuffer(hitbuffer, next_offset); michael@0: michael@0: letter_offset = next_offset; michael@0: } michael@0: michael@0: delete hitbuffer; michael@0: // Context across buffers is not connected yet michael@0: scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE; michael@0: } michael@0: michael@0: michael@0: michael@0: // Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext michael@0: // We have a scriptspan with all lowercase text in one script. Look up michael@0: // quadgrams and octagrams, saving the hits in three parallel vectors. michael@0: // Score from those vectors in chunks, toting each chunk to get a single michael@0: // language, and combining into the overall document score. The hit vectors michael@0: // in general are not big enough to handle and entire scriptspan, so michael@0: // repeat until the entire scriptspan is scored. michael@0: // Caller deals with minimizing numbr of runt scriptspans michael@0: // This routine deals with minimizing number of runt chunks. michael@0: // michael@0: // Returns updated scoringcontext michael@0: // Returns updated doc_tote michael@0: // If vec != NULL, appends to that vector of ResultChunk's michael@0: void ScoreQuadScriptSpan(const LangSpan& scriptspan, michael@0: ScoringContext* scoringcontext, michael@0: DocTote* doc_tote, michael@0: ResultChunkVector* vec) { michael@0: // Allocate three parallel arrays of scoring hits michael@0: ScoringHitBuffer* hitbuffer = new ScoringHitBuffer; michael@0: hitbuffer->init(); michael@0: hitbuffer->ulscript = scriptspan.ulscript; michael@0: michael@0: scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE; michael@0: scoringcontext->oldest_distinct_boost = 0; michael@0: michael@0: // Incoming scriptspan has a single leading space at scriptspan.text[0] michael@0: // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3] michael@0: michael@0: int letter_offset = 1; // Skip initial space michael@0: hitbuffer->lowest_offset = letter_offset; michael@0: int letter_limit = scriptspan.text_bytes; michael@0: while (letter_offset < letter_limit) { michael@0: // michael@0: // Fill up one hitbuffer, possibly splicing onto previous fragment michael@0: // michael@0: // NOTE: GetQuadHits deals with close repeats michael@0: // NOTE: After last chunk there is always a hitbuffer entry with an offset michael@0: // just off the end of the text = next_offset. michael@0: int next_offset = GetQuadHits(scriptspan.text, letter_offset, letter_limit, michael@0: scoringcontext, hitbuffer); michael@0: // If true, there is more text to process in this scriptspan michael@0: // NOTE: GetOctaHitVectors deals with close repeats, michael@0: // does one hash and two lookups (delta and distinct) per word michael@0: GetOctaHits(scriptspan.text, letter_offset, next_offset, michael@0: scoringcontext, hitbuffer); michael@0: michael@0: // michael@0: // Score one hitbuffer in chunks to summarybuffer michael@0: // michael@0: bool more_to_come = next_offset < letter_limit; michael@0: bool score_cjk = false; michael@0: ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec, michael@0: more_to_come, score_cjk, hitbuffer); michael@0: SpliceHitBuffer(hitbuffer, next_offset); michael@0: michael@0: letter_offset = next_offset; michael@0: } michael@0: michael@0: delete hitbuffer; michael@0: } michael@0: michael@0: michael@0: // Score one scriptspan into doc_tote and vec, updating scoringcontext michael@0: // Inputs: michael@0: // One scriptspan of perhaps 40-60KB, all same script lower-case letters michael@0: // and single ASCII spaces. First character is a space to allow simple michael@0: // begining-of-word detect. End of buffer has three spaces and NUL to michael@0: // allow easy scan-to-end-of-word. michael@0: // Scoring context of michael@0: // scoring tables michael@0: // flags michael@0: // running boosts michael@0: // Outputs: michael@0: // Updated doc_tote giving overall languages and byte counts michael@0: // Optional updated chunk vector giving offset, length, language michael@0: // michael@0: // Caller initializes flags, boosts, doc_tote and vec. michael@0: // Caller aggregates across multiple scriptspans michael@0: // Caller calculates final document result michael@0: // Caller deals with detecting and triggering suppression of repeated text. michael@0: // michael@0: // This top-level routine just chooses the recognition type and calls one of michael@0: // the next-level-down routines. michael@0: // michael@0: void ScoreOneScriptSpan(const LangSpan& scriptspan, michael@0: ScoringContext* scoringcontext, michael@0: DocTote* doc_tote, michael@0: ResultChunkVector* vec) { michael@0: if (scoringcontext->flags_cld2_verbose) { michael@0: fprintf(scoringcontext->debug_file, "
ScoreOneScriptSpan(%s,%d) ", michael@0: ULScriptCode(scriptspan.ulscript), scriptspan.text_bytes); michael@0: // Optionally print the chunk lowercase letters/marks text michael@0: string temp(&scriptspan.text[0], scriptspan.text_bytes); michael@0: fprintf(scoringcontext->debug_file, "'%s'", michael@0: GetHtmlEscapedText(temp).c_str()); michael@0: fprintf(scoringcontext->debug_file, "
\n"); michael@0: } michael@0: scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE; michael@0: scoringcontext->oldest_distinct_boost = 0; michael@0: ULScriptRType rtype = ULScriptRecognitionType(scriptspan.ulscript); michael@0: if (scoringcontext->flags_cld2_score_as_quads && (rtype != RTypeCJK)) { michael@0: rtype = RTypeMany; michael@0: } michael@0: switch (rtype) { michael@0: case RTypeNone: michael@0: case RTypeOne: michael@0: ScoreEntireScriptSpan(scriptspan, scoringcontext, doc_tote, vec); michael@0: break; michael@0: case RTypeCJK: michael@0: ScoreCJKScriptSpan(scriptspan, scoringcontext, doc_tote, vec); michael@0: break; michael@0: case RTypeMany: michael@0: ScoreQuadScriptSpan(scriptspan, scoringcontext, doc_tote, vec); michael@0: break; michael@0: } michael@0: } michael@0: michael@0: } // End namespace CLD2 michael@0: