michael@0: // michael@0: // file: rbbistbl.cpp Implementation of the ICU RBBISymbolTable class michael@0: // michael@0: /* michael@0: *************************************************************************** michael@0: * Copyright (C) 2002-2011 International Business Machines Corporation michael@0: * and others. All rights reserved. michael@0: *************************************************************************** michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_BREAK_ITERATION michael@0: michael@0: #include "unicode/unistr.h" michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/uchar.h" michael@0: #include "unicode/parsepos.h" michael@0: michael@0: #include "umutex.h" michael@0: michael@0: #include "rbbirb.h" michael@0: #include "rbbinode.h" michael@0: michael@0: michael@0: // michael@0: // RBBISymbolTableEntry_deleter Used by the UHashTable to delete the contents michael@0: // when the hash table is deleted. michael@0: // michael@0: U_CDECL_BEGIN michael@0: static void U_CALLCONV RBBISymbolTableEntry_deleter(void *p) { michael@0: icu::RBBISymbolTableEntry *px = (icu::RBBISymbolTableEntry *)p; michael@0: delete px; michael@0: } michael@0: U_CDECL_END michael@0: michael@0: michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: RBBISymbolTable::RBBISymbolTable(RBBIRuleScanner *rs, const UnicodeString &rules, UErrorCode &status) michael@0: :fRules(rules), fRuleScanner(rs), ffffString(UChar(0xffff)) michael@0: { michael@0: fHashTable = NULL; michael@0: fCachedSetLookup = NULL; michael@0: michael@0: fHashTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, &status); michael@0: // uhash_open checks status michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: uhash_setValueDeleter(fHashTable, RBBISymbolTableEntry_deleter); michael@0: } michael@0: michael@0: michael@0: michael@0: RBBISymbolTable::~RBBISymbolTable() michael@0: { michael@0: uhash_close(fHashTable); michael@0: } michael@0: michael@0: michael@0: // michael@0: // RBBISymbolTable::lookup This function from the abstract symbol table inteface michael@0: // looks up a variable name and returns a UnicodeString michael@0: // containing the substitution text. michael@0: // michael@0: // The variable name does NOT include the leading $. michael@0: // michael@0: const UnicodeString *RBBISymbolTable::lookup(const UnicodeString& s) const michael@0: { michael@0: RBBISymbolTableEntry *el; michael@0: RBBINode *varRefNode; michael@0: RBBINode *exprNode; michael@0: RBBINode *usetNode; michael@0: const UnicodeString *retString; michael@0: RBBISymbolTable *This = (RBBISymbolTable *)this; // cast off const michael@0: michael@0: el = (RBBISymbolTableEntry *)uhash_get(fHashTable, &s); michael@0: if (el == NULL) { michael@0: return NULL; michael@0: } michael@0: michael@0: varRefNode = el->val; michael@0: exprNode = varRefNode->fLeftChild; // Root node of expression for variable michael@0: if (exprNode->fType == RBBINode::setRef) { michael@0: // The $variable refers to a single UnicodeSet michael@0: // return the ffffString, which will subsequently be interpreted as a michael@0: // stand-in character for the set by RBBISymbolTable::lookupMatcher() michael@0: usetNode = exprNode->fLeftChild; michael@0: This->fCachedSetLookup = usetNode->fInputSet; michael@0: retString = &ffffString; michael@0: } michael@0: else michael@0: { michael@0: // The variable refers to something other than just a set. michael@0: // return the original source string for the expression michael@0: retString = &exprNode->fText; michael@0: This->fCachedSetLookup = NULL; michael@0: } michael@0: return retString; michael@0: } michael@0: michael@0: michael@0: michael@0: // michael@0: // RBBISymbolTable::lookupMatcher This function from the abstract symbol table michael@0: // interface maps a single stand-in character to a michael@0: // pointer to a Unicode Set. The Unicode Set code uses this michael@0: // mechanism to get all references to the same $variable michael@0: // name to refer to a single common Unicode Set instance. michael@0: // michael@0: // This implementation cheats a little, and does not maintain a map of stand-in chars michael@0: // to sets. Instead, it takes advantage of the fact that the UnicodeSet michael@0: // constructor will always call this function right after calling lookup(), michael@0: // and we just need to remember what set to return between these two calls. michael@0: const UnicodeFunctor *RBBISymbolTable::lookupMatcher(UChar32 ch) const michael@0: { michael@0: UnicodeSet *retVal = NULL; michael@0: RBBISymbolTable *This = (RBBISymbolTable *)this; // cast off const michael@0: if (ch == 0xffff) { michael@0: retVal = fCachedSetLookup; michael@0: This->fCachedSetLookup = 0; michael@0: } michael@0: return retVal; michael@0: } michael@0: michael@0: // michael@0: // RBBISymbolTable::parseReference This function from the abstract symbol table interface michael@0: // looks for a $variable name in the source text. michael@0: // It does not look it up, only scans for it. michael@0: // It is used by the UnicodeSet parser. michael@0: // michael@0: // This implementation is lifted pretty much verbatim michael@0: // from the rules based transliterator implementation. michael@0: // I didn't see an obvious way of sharing it. michael@0: // michael@0: UnicodeString RBBISymbolTable::parseReference(const UnicodeString& text, michael@0: ParsePosition& pos, int32_t limit) const michael@0: { michael@0: int32_t start = pos.getIndex(); michael@0: int32_t i = start; michael@0: UnicodeString result; michael@0: while (i < limit) { michael@0: UChar c = text.charAt(i); michael@0: if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) { michael@0: break; michael@0: } michael@0: ++i; michael@0: } michael@0: if (i == start) { // No valid name chars michael@0: return result; // Indicate failure with empty string michael@0: } michael@0: pos.setIndex(i); michael@0: text.extractBetween(start, i, result); michael@0: return result; michael@0: } michael@0: michael@0: michael@0: michael@0: // michael@0: // RBBISymbolTable::lookupNode Given a key (a variable name), return the michael@0: // corresponding RBBI Node. If there is no entry michael@0: // in the table for this name, return NULL. michael@0: // michael@0: RBBINode *RBBISymbolTable::lookupNode(const UnicodeString &key) const{ michael@0: michael@0: RBBINode *retNode = NULL; michael@0: RBBISymbolTableEntry *el; michael@0: michael@0: el = (RBBISymbolTableEntry *)uhash_get(fHashTable, &key); michael@0: if (el != NULL) { michael@0: retNode = el->val; michael@0: } michael@0: return retNode; michael@0: } michael@0: michael@0: michael@0: // michael@0: // RBBISymbolTable::addEntry Add a new entry to the symbol table. michael@0: // Indicate an error if the name already exists - michael@0: // this will only occur in the case of duplicate michael@0: // variable assignments. michael@0: // michael@0: void RBBISymbolTable::addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err) { michael@0: RBBISymbolTableEntry *e; michael@0: /* test for buffer overflows */ michael@0: if (U_FAILURE(err)) { michael@0: return; michael@0: } michael@0: e = (RBBISymbolTableEntry *)uhash_get(fHashTable, &key); michael@0: if (e != NULL) { michael@0: err = U_BRK_VARIABLE_REDFINITION; michael@0: return; michael@0: } michael@0: michael@0: e = new RBBISymbolTableEntry; michael@0: if (e == NULL) { michael@0: err = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: e->key = key; michael@0: e->val = val; michael@0: uhash_put( fHashTable, &e->key, e, &err); michael@0: } michael@0: michael@0: michael@0: RBBISymbolTableEntry::RBBISymbolTableEntry() : UMemory(), key(), val(NULL) {} michael@0: michael@0: RBBISymbolTableEntry::~RBBISymbolTableEntry() { michael@0: // The "val" of a symbol table entry is a variable reference node. michael@0: // The l. child of the val is the rhs expression from the assignment. michael@0: // Unlike other node types, children of variable reference nodes are not michael@0: // automatically recursively deleted. We do it manually here. michael@0: delete val->fLeftChild; michael@0: val->fLeftChild = NULL; michael@0: michael@0: delete val; michael@0: michael@0: // Note: the key UnicodeString is destructed by virtue of being in the object by value. michael@0: } michael@0: michael@0: michael@0: // michael@0: // RBBISymbolTable::print Debugging function, dump out the symbol table contents. michael@0: // michael@0: #ifdef RBBI_DEBUG michael@0: void RBBISymbolTable::rbbiSymtablePrint() const { michael@0: RBBIDebugPrintf("Variable Definitions\n" michael@0: "Name Node Val String Val\n" michael@0: "----------------------------------------------------------------------\n"); michael@0: michael@0: int32_t pos = -1; michael@0: const UHashElement *e = NULL; michael@0: for (;;) { michael@0: e = uhash_nextElement(fHashTable, &pos); michael@0: if (e == NULL ) { michael@0: break; michael@0: } michael@0: RBBISymbolTableEntry *s = (RBBISymbolTableEntry *)e->value.pointer; michael@0: michael@0: RBBI_DEBUG_printUnicodeString(s->key, 15); michael@0: RBBIDebugPrintf(" %8p ", (void *)s->val); michael@0: RBBI_DEBUG_printUnicodeString(s->val->fLeftChild->fText); michael@0: RBBIDebugPrintf("\n"); michael@0: } michael@0: michael@0: RBBIDebugPrintf("\nParsed Variable Definitions\n"); michael@0: pos = -1; michael@0: for (;;) { michael@0: e = uhash_nextElement(fHashTable, &pos); michael@0: if (e == NULL ) { michael@0: break; michael@0: } michael@0: RBBISymbolTableEntry *s = (RBBISymbolTableEntry *)e->value.pointer; michael@0: RBBI_DEBUG_printUnicodeString(s->key); michael@0: s->val->fLeftChild->printTree(TRUE); michael@0: RBBIDebugPrintf("\n"); michael@0: } michael@0: } michael@0: #endif michael@0: michael@0: michael@0: michael@0: michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif /* #if !UCONFIG_NO_BREAK_ITERATION */