michael@0: /*
michael@0:  *******************************************************************************
michael@0:  *   Copyright (C) 2003-2007, International Business Machines
michael@0:  *   Corporation and others.  All Rights Reserved.
michael@0:  *******************************************************************************
michael@0:  *
michael@0:  * File prscmnts.cpp
michael@0:  *
michael@0:  * Modification History:
michael@0:  *
michael@0:  *   Date          Name        Description
michael@0:  *   08/22/2003    ram         Creation.
michael@0:  *******************************************************************************
michael@0:  */
michael@0: 
michael@0: #include "unicode/regex.h"
michael@0: #include "unicode/unistr.h"
michael@0: #include "unicode/parseerr.h"
michael@0: #include "prscmnts.h"
michael@0: #include <stdio.h>
michael@0: #include <stdlib.h>
michael@0: 
michael@0: U_NAMESPACE_USE
michael@0: 
michael@0: #if UCONFIG_NO_REGULAR_EXPRESSIONS==0 /* donot compile when RegularExpressions not available */
michael@0: 
michael@0: #define MAX_SPLIT_STRINGS 20
michael@0: 
michael@0: const char *patternStrings[UPC_LIMIT]={
michael@0:     "^translate\\s*(.*)",
michael@0:     "^note\\s*(.*)"
michael@0: };
michael@0: 
michael@0: U_CFUNC int32_t 
michael@0: removeText(UChar *source, int32_t srcLen, 
michael@0:            UnicodeString patString,uint32_t options,  
michael@0:            UnicodeString replaceText, UErrorCode *status){
michael@0: 
michael@0:     if(status == NULL || U_FAILURE(*status)){
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     UnicodeString src(source, srcLen);
michael@0: 
michael@0:     RegexMatcher    myMatcher(patString, src, options, *status);
michael@0:     if(U_FAILURE(*status)){
michael@0:         return 0;
michael@0:     }
michael@0:     UnicodeString dest;
michael@0: 
michael@0: 
michael@0:     dest = myMatcher.replaceAll(replaceText,*status);
michael@0:     
michael@0:     
michael@0:     return dest.extract(source, srcLen, *status);
michael@0: 
michael@0: }
michael@0: U_CFUNC int32_t
michael@0: trim(UChar *src, int32_t srcLen, UErrorCode *status){
michael@0:      srcLen = removeText(src, srcLen, "^[ \\r\\n]+ ", 0, "", status); // remove leading new lines
michael@0:      srcLen = removeText(src, srcLen, "^\\s+", 0, "", status); // remove leading spaces
michael@0:      srcLen = removeText(src, srcLen, "\\s+$", 0, "", status); // remvoe trailing spcaes
michael@0:      return srcLen;
michael@0: }
michael@0: 
michael@0: U_CFUNC int32_t 
michael@0: removeCmtText(UChar* source, int32_t srcLen, UErrorCode* status){
michael@0:     srcLen = trim(source, srcLen, status);
michael@0:     UnicodeString     patString = "^\\s*?\\*\\s*?";     // remove pattern like " * " at the begining of the line
michael@0:     srcLen = removeText(source, srcLen, patString, UREGEX_MULTILINE, "", status);
michael@0:     return removeText(source, srcLen, "[ \\r\\n]+", 0, " ", status);// remove new lines;
michael@0: }
michael@0: 
michael@0: U_CFUNC int32_t 
michael@0: getText(const UChar* source, int32_t srcLen,
michael@0:         UChar** dest, int32_t destCapacity,
michael@0:         UnicodeString patternString, 
michael@0:         UErrorCode* status){
michael@0:     
michael@0:     if(status == NULL || U_FAILURE(*status)){
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     UnicodeString     stringArray[MAX_SPLIT_STRINGS];
michael@0:     RegexPattern      *pattern = RegexPattern::compile("@", 0, *status);
michael@0:     UnicodeString src (source,srcLen);
michael@0:     
michael@0:     if (U_FAILURE(*status)) {
michael@0:         return 0;
michael@0:     }
michael@0:     pattern->split(src, stringArray, MAX_SPLIT_STRINGS, *status);
michael@0:     
michael@0:     RegexMatcher matcher(patternString, UREGEX_DOTALL, *status);
michael@0:     if (U_FAILURE(*status)) {
michael@0:         return 0;
michael@0:     }
michael@0:     for(int32_t i=0; i<MAX_SPLIT_STRINGS; i++){
michael@0:         matcher.reset(stringArray[i]);
michael@0:         if(matcher.lookingAt(*status)){
michael@0:             UnicodeString out = matcher.group(1, *status);
michael@0: 
michael@0:             return out.extract(*dest, destCapacity,*status);
michael@0:         }
michael@0:     }
michael@0:     return 0;
michael@0: }
michael@0: 
michael@0: 
michael@0: #define AT_SIGN  0x0040
michael@0: 
michael@0: U_CFUNC int32_t
michael@0: getDescription( const UChar* source, int32_t srcLen,
michael@0:                 UChar** dest, int32_t destCapacity,
michael@0:                 UErrorCode* status){
michael@0:     if(status == NULL || U_FAILURE(*status)){
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     UnicodeString     stringArray[MAX_SPLIT_STRINGS];
michael@0:     RegexPattern      *pattern = RegexPattern::compile("@", UREGEX_MULTILINE, *status);
michael@0:     UnicodeString src(source, srcLen);
michael@0:     
michael@0:     if (U_FAILURE(*status)) {
michael@0:         return 0;
michael@0:     }
michael@0:     pattern->split(src, stringArray,MAX_SPLIT_STRINGS , *status);
michael@0: 
michael@0:     if(stringArray[0].indexOf((UChar)AT_SIGN)==-1){
michael@0:         int32_t destLen =  stringArray[0].extract(*dest, destCapacity, *status);
michael@0:         return trim(*dest, destLen, status);
michael@0:     }
michael@0:     return 0;
michael@0: }
michael@0: 
michael@0: U_CFUNC int32_t
michael@0: getCount(const UChar* source, int32_t srcLen, 
michael@0:          UParseCommentsOption option, UErrorCode *status){
michael@0:     
michael@0:     if(status == NULL || U_FAILURE(*status)){
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     UnicodeString     stringArray[MAX_SPLIT_STRINGS];
michael@0:     RegexPattern      *pattern = RegexPattern::compile("@", UREGEX_MULTILINE, *status);
michael@0:     UnicodeString src (source, srcLen);
michael@0: 
michael@0: 
michael@0:     if (U_FAILURE(*status)) {
michael@0:         return 0;
michael@0:     }
michael@0:     int32_t retLen = pattern->split(src, stringArray, MAX_SPLIT_STRINGS, *status);
michael@0:     
michael@0:     RegexMatcher matcher(patternStrings[option], UREGEX_DOTALL, *status);
michael@0:     if (U_FAILURE(*status)) {
michael@0:         return 0;
michael@0:     } 
michael@0:     int32_t count = 0;
michael@0:     for(int32_t i=0; i<retLen; i++){
michael@0:         matcher.reset(stringArray[i]);
michael@0:         if(matcher.lookingAt(*status)){
michael@0:             count++;
michael@0:         }
michael@0:     }
michael@0:     if(option == UPC_TRANSLATE && count > 1){
michael@0:         fprintf(stderr, "Multiple @translate tags cannot be supported.\n");
michael@0:         exit(U_UNSUPPORTED_ERROR);
michael@0:     }
michael@0:     return count;
michael@0: }
michael@0: 
michael@0: U_CFUNC int32_t 
michael@0: getAt(const UChar* source, int32_t srcLen,
michael@0:         UChar** dest, int32_t destCapacity,
michael@0:         int32_t index,
michael@0:         UParseCommentsOption option,
michael@0:         UErrorCode* status){
michael@0: 
michael@0:     if(status == NULL || U_FAILURE(*status)){
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     UnicodeString     stringArray[MAX_SPLIT_STRINGS];
michael@0:     RegexPattern      *pattern = RegexPattern::compile("@", UREGEX_MULTILINE, *status);
michael@0:     UnicodeString src (source, srcLen);
michael@0: 
michael@0: 
michael@0:     if (U_FAILURE(*status)) {
michael@0:         return 0;
michael@0:     }
michael@0:     int32_t retLen = pattern->split(src, stringArray, MAX_SPLIT_STRINGS, *status);
michael@0:     
michael@0:     RegexMatcher matcher(patternStrings[option], UREGEX_DOTALL, *status);
michael@0:     if (U_FAILURE(*status)) {
michael@0:         return 0;
michael@0:     } 
michael@0:     int32_t count = 0;
michael@0:     for(int32_t i=0; i<retLen; i++){
michael@0:         matcher.reset(stringArray[i]);
michael@0:         if(matcher.lookingAt(*status)){
michael@0:             if(count == index){
michael@0:                 UnicodeString out = matcher.group(1, *status);
michael@0:                 return out.extract(*dest, destCapacity,*status);
michael@0:             }
michael@0:             count++;
michael@0:             
michael@0:         }
michael@0:     }
michael@0:     return 0;
michael@0: 
michael@0: }
michael@0: 
michael@0: U_CFUNC int32_t
michael@0: getTranslate( const UChar* source, int32_t srcLen,
michael@0:               UChar** dest, int32_t destCapacity,
michael@0:               UErrorCode* status){
michael@0:     UnicodeString     notePatternString = "^translate\\s*?(.*)"; 
michael@0:     
michael@0:     int32_t destLen = getText(source, srcLen, dest, destCapacity, notePatternString, status);
michael@0:     return trim(*dest, destLen, status);
michael@0: }
michael@0: 
michael@0: U_CFUNC int32_t 
michael@0: getNote(const UChar* source, int32_t srcLen,
michael@0:         UChar** dest, int32_t destCapacity,
michael@0:         UErrorCode* status){
michael@0: 
michael@0:     UnicodeString     notePatternString = "^note\\s*?(.*)"; 
michael@0:     int32_t destLen =  getText(source, srcLen, dest, destCapacity, notePatternString, status);
michael@0:     return trim(*dest, destLen, status);
michael@0: 
michael@0: }
michael@0: 
michael@0: #endif /* UCONFIG_NO_REGULAR_EXPRESSIONS */
michael@0: