--- abi/src/wp/impexp/xp/ie_imp_RTF.cpp 2005-06-03 04:19:40.645871064 +0100 +++ abi/src/wp/impexp/xp/ie_imp_RTF.cpp-NEWEST 2005-06-03 04:08:56.000000000 +0100 @@ -919,8 +919,10 @@ // Font table items -RTFFontTableItem::RTFFontTableItem(FontFamilyEnum fontFamily, int charSet, int codepage, FontPitch pitch, - unsigned char* panose, char* pFontName, char* pAlternativeFontName) +RTFFontTableItem::RTFFontTableItem(FontFamilyEnum fontFamily, int charSet, + int codepage, FontPitch pitch, + unsigned char* panose, char* + pFontName, char* pAlternativeFontName) { m_family = fontFamily; m_charSet = charSet; @@ -7919,8 +7921,9 @@ int codepage = 0; unsigned char panose[10]; memset(panose, 0, sizeof(unsigned char)); - char* pFontName = NULL; - char* pAlternativeFontName = NULL; + UT_String sFontName; + UT_String sAlternativeFontName; + bool bFontNameHasHex, bAltFontNameHasHex; RTFTokenType tokenType; //TODO - this should be intialized once for the whole RTF reader. @@ -7974,8 +7977,12 @@ fontFamily = RTFFontTableItem::ffNone; } } - // Now (possibly) comes some optional keyword before the fontname - while (tokenType != RTF_TOKEN_DATA || nesting > 0) + + + // Loop through the rest of the font definition, calling out to + // ReadFontName() to read the font names. + nesting=1; // We should have had a "{" at the start of the font definition. + while (nesting > 0) { tokenType = NextToken(keyword,¶meter,¶mUsed,MAX_KEYWORD_LEN,true); switch (tokenType) @@ -7986,7 +7993,12 @@ case RTF_TOKEN_CLOSE_BRACE: nesting --; break; + // Data indicates the start of the font name. case RTF_TOKEN_DATA: + SkipBackChar(keyword[0]); // Data can only be one byte, right? 
+ if (!ReadFontName(&sFontName, &sAlternativeFontName, + &bFontNameHasHex, &bAltFontNameHasHex)) + return false; break; case RTF_TOKEN_KEYWORD: pValue = const_cast(static_cast(keywordMap.pick(reinterpret_cast(&keyword[0])))); @@ -8038,81 +8050,77 @@ { charSet = parameter; } + // Escaped hex is really data, so this should be the start of the + // font name. + if (strcmp(reinterpret_cast(&keyword[0]),"'") == 0) { + SkipBackChar('\''); + SkipBackChar('\\'); + if (!ReadFontName(&sFontName, &sAlternativeFontName, + &bFontNameHasHex, &bAltFontNameHasHex)) + return false; + } break; default: //TODO: handle errors break; } } - if (nesting == -1) - { - UT_DEBUGMSG(("RTF: Font name not found in font definition %d",fontIndex)); - } - // Now comes the font name, terminated by either a close brace or a slash or a semi-colon - ch = keyword[0]; - int count = 0; - /* - FIXME: CJK font names come in form \'aa\'cd\'ef - so we have to - parse \'HH correctly (currently we ignore them!) - VH - */ - while ( ch != '}' && ch != '\\' && ch != ';' && ch!= '{') - { - keyword[count++] = ch; - if (!ReadCharFromFile(&ch)) - { - return false; - } - } - if (ch=='{') - { - ++nesting; - } - keyword[count] = 0; #ifndef XP_TARGET_COCOA /*work around "helvetica" font name -replace it with "Helvetic"*/ - if (!UT_stricmp(reinterpret_cast(&keyword[0]),"helvetica")) + if (sFontName == "helvetica") { - strcpy(reinterpret_cast(&keyword[0]),"Helvetic"); + sFontName == "Helvetica"; } #endif /* ! XP_TARGET_COCOA */ - if (!UT_cloneString(pFontName, reinterpret_cast(&keyword[0]))) - { - // TODO outofmem + /* TODO: + * + * In an ideal world, this function could just set the FontName and the + * Alternative fontname and not do any of the messing around below. + * Unfortunately, at the moment the rest of abiword doesn't support + * non-ASCII font names and doesn't appear to do anything much with the + * alternative fontname. 
Therefore, we have to do our best to provide a + * sensible font name here even if the file only specifies a non-ASCII + * name. + * + * Just ignoring non-ASCII characters don't help, because then font names + * like XXXXX_GB2312 get called "_GB2312". Therefore, if the font name + * contains non-ASCII characters and there's an ASCII alternative name + * we use the alternative. If the alternative is missing, or itself + * uses non-ASCII characters we set the font name to + * "UnknownUnicodeFontName". This at least gives the user something + * understandable to see in the font list. Also, if they alias + * UnknownUnicodeFontName to the most likely font for their region (e.g. + * SongTi in China) then most of their documents should display correctly. + */ + if (sFontName.length() == 0 || bFontNameHasHex) { + if (sAlternativeFontName.length() > 0 && !bAltFontNameHasHex) + sFontName = sAlternativeFontName; + else + sFontName = "UnknownUnicodeFontName"; } - for (int i=0; i <= nesting; ++i) - { - // Munch the remaining control words down to the close brace - while (ch != '}') - { - if (ch == ';' && !bNested && (i == nesting)) - { - break; // Cocoa RTF: {\fonttbl\f0\fnil\fcharset78 HiraKakuPro-W3;\f1\fnil\fcharset102 STXihei;} - } - if (!ReadCharFromFile(&ch)) - { - return false; - } - if (ch=='{') - { - ++nesting; - } + + // Clone the font name, or set it to NULL if it was an empty string. + char *fn=NULL; + if (sFontName.length()) { + if (!UT_cloneString(fn, sFontName.c_str())) { + UT_DEBUGMSG(("RTF: Out of memory parsing font table.\n")); + return false; } - if (nesting>0 && i!=nesting) //we need to skip '}' we've just seen. - { - if (!ReadCharFromFile(&ch)) - { - return false; - } + } + // Do the same for the alternative font name. 
+ char *afn=NULL; + if (sAlternativeFontName.length()) { + if (!UT_cloneString(afn, sAlternativeFontName.c_str())) { + UT_DEBUGMSG(("RTF: Out of memory parsing font table.\n")); + return false; } } - + // Create the font entry and put it into the font table RTFFontTableItem* pNewFont = new RTFFontTableItem(fontFamily, charSet, - codepage, pitch, - panose, pFontName, - pAlternativeFontName); + codepage, pitch, panose, fn, afn); if (pNewFont == NULL) { return false; @@ -8134,7 +8142,7 @@ } else { - UT_DEBUGMSG (("RTF: font %d (named %s) already defined. Ignoring\n", fontIndex, pFontName)); + UT_DEBUGMSG (("RTF: font %d (named %s) already defined. Ignoring\n", fontIndex, sFontName.c_str())); DELETEP (pNewFont); } @@ -8142,6 +8150,172 @@ } +/* + * Read the font name of the current font from the input stream. + * If present, also read the alternative font name. This should + * deal correctly with any commands embedded in the font name. + + * Eg: + * {\f18\fnil\fcharset134\fprq2{\*\panose 02010600030101010101} + * \'cb\'ce\'cc\'e5{\*\falt SimSun};} + * + * or even: + * {\f20\froman Times New {\*\unknowncommand Fibble!}Roman;} + * + * + * Currently escaped hex data (\'XX) is discarded since the rest of the + * program cannot cope with non-ASCII fontnames. + */ + +/* The state used while reading in the font name. + * This points to the font name that we are currently writing. Initially + * it's set to point to FontName. We switch it to point to AltFontName + * when we see a \falt command. When the group containing the \falt ends + * we pop the state of the stack and so this pointer reverts back to + * FontName. 
+ */
+struct SFontNameState {
+	UT_String *pFontName;        // Name that data characters are appended to.
+	bool      *pbFontNameHasHex; // Hex flag that goes with pFontName.
+	bool       bSkipping;        // Set once an unknown "\*" keyword is seen;
+	                             // data and keywords are ignored while set.
+};
+
+/*
+ * Parse the font name part of a font definition.
+ *
+ * sFontName           Plain data characters are appended to this string.
+ * sAltFontName        Data characters following "\falt" are appended here.
+ * bFontNameHasHex     Set to true if a \'XX escape was dropped while the
+ *                     font name was the current target, otherwise false.
+ * bAltFontNameHasHex  The same, for the alternative font name.
+ *
+ * Returns true once the terminating ';' has been read outside any nested
+ * group.  Returns false on unbalanced braces, if a character cannot be
+ * read, or if the input ends first.
+ *
+ * Ownership: the outermost state is a local variable; every nested group
+ * gets a heap-allocated state.  All exits go through the clean-up code at
+ * the bottom, which frees the heap states and never deletes the local one.
+ */
+bool IE_Imp_RTF::ReadFontName(UT_String *sFontName,
+                              UT_String *sAltFontName,
+                              bool *bFontNameHasHex,
+                              bool *bAltFontNameHasHex)
+{
+	unsigned char keyword[MAX_KEYWORD_LEN];
+	RTFTokenType tokenType;
+	UT_sint32 parameter = 0;
+	unsigned char ch;
+	bool paramUsed = false;
+	bool bSeenStar = false; // Was the previous keyword "\*"?
+	bool bDone = false;     // Set when the loop should stop.
+	bool bOk = false;       // Return value; true only for a clean ';'.
+	UT_Stack stateStack;
+	void *pSaved = NULL;    // Scratch pointer for popping the stack.
+	// The base state lives on this function's stack, so it must never be
+	// handed to delete.
+	SFontNameState baseState;
+	SFontNameState *currentState = &baseState;
+	SFontNameState *newState = NULL;
+
+	baseState.pFontName = sFontName;
+	baseState.pbFontNameHasHex = bFontNameHasHex;
+	baseState.bSkipping = false;
+
+	*bFontNameHasHex = false;
+	*bAltFontNameHasHex = false;
+	while (!bDone)
+	{
+		// NB: This doesn't ignore whitespace.
+		tokenType = NextToken(keyword,&parameter,&paramUsed,MAX_KEYWORD_LEN,false);
+		switch (tokenType)
+		{
+		case RTF_TOKEN_OPEN_BRACE:
+			// The state for the new group starts as a copy of the
+			// enclosing state, which is saved on the stack.
+			newState = new SFontNameState(*currentState);
+			if (!newState)
+			{
+				UT_DEBUGMSG(("RTF: Out of memory.\n"));
+				bDone = true;
+				break;
+			}
+			stateStack.push(reinterpret_cast<void *>(currentState));
+			currentState = newState;
+			break;
+		case RTF_TOKEN_CLOSE_BRACE:
+			// A close brace with no group open is unbalanced.  The
+			// current state is then the base state, which must not be
+			// deleted.
+			if (currentState == &baseState)
+			{
+				UT_DEBUGMSG(("RTF: Too many closing parenthesises in font table.\n"));
+				bDone = true;
+				break;
+			}
+			// Throw away the state of the group that just ended and go
+			// back to the enclosing one.
+			delete currentState;
+			currentState = &baseState;
+			pSaved = NULL;
+			if (!stateStack.pop(&pSaved))
+			{
+				UT_DEBUGMSG(("RTF: Too many closing parenthesises in font table.\n"));
+				bDone = true;
+				break;
+			}
+			currentState = static_cast<SFontNameState *>(pSaved);
+			break;
+		case RTF_TOKEN_DATA:
+			// Are we skipping?
+			if (currentState->bSkipping)
+				break;
+			// We found the font name terminator.  It is only valid
+			// when every group opened inside the name has been closed.
+			if (keyword[0] == ';')
+			{
+				bOk = (stateStack.getDepth() == 0);
+				if (!bOk)
+				{
+					UT_DEBUGMSG(("RTF: Too many opening parenthesises in font table.\n"));
+				}
+				bDone = true;
+				break;
+			}
+			// Other data must be part of one of the font names, so
+			// append it to whichever name is current.
+			*(currentState->pFontName) += keyword[0];
+			break;
+		case RTF_TOKEN_KEYWORD:
+			// Are we skipping?
+			if (currentState->bSkipping)
+				break;
+
+			// Handle the "*" keyword.
+			if (strcmp(reinterpret_cast<char *>(&keyword[0]),"*") == 0)
+			{
+				bSeenStar = true;
+				break;
+			}
+
+			if (strcmp(reinterpret_cast<char *>(&keyword[0]),"'") == 0)
+			{
+				// Sadly the Abi backend lacks support for multibyte
+				// font names.  Thus we have to skip over hex sequences
+				// and hope for an alternative font name.
+				// TODO: This will also ignore hex escaped ASCII, which
+				// would be legal in the file, if a little strange.
+				if ( !ReadCharFromFile(&ch) || !ReadCharFromFile(&ch) )
+				{
+					bDone = true;
+				}
+				else
+				{
+					// Record that we've seen hex.
+					*(currentState->pbFontNameHasHex) = true;
+				}
+			}
+			else if (strcmp(reinterpret_cast<char *>(&keyword[0]),"falt") == 0)
+			{
+				// From here on data in this group is written to the
+				// alternative font name.
+				currentState->pFontName = sAltFontName;
+				currentState->pbFontNameHasHex = bAltFontNameHasHex;
+			}
+			else if (bSeenStar)
+			{
+				// Unknown keyword introduced by "\*": ignore the rest
+				// of the group it appears in.
+				currentState->bSkipping = true;
+			}
+			// "\*" only qualifies the keyword that directly follows it.
+			// Without this reset a "\*\falt" group would leave the flag
+			// set and a later unknown keyword would start skipping.
+			bSeenStar = false;
+			break;
+		case RTF_TOKEN_NONE:
+			UT_DEBUGMSG(("Premature end of file reading font table.\n"));
+			bDone = true;
+			break;
+		default:
+			break;
+		} // switch
+	} // while
+
+	// Memory clean up: free the state of any group that is still open.
+	// The base state may be sitting at the bottom of the stack, so it is
+	// filtered out rather than deleted.
+	if (currentState != &baseState)
+		delete currentState;
+	while (stateStack.getDepth() > 0)
+	{
+		pSaved = NULL;
+		if (!stateStack.pop(&pSaved))
+			break;
+		if (pSaved != &baseState)
+			delete static_cast<SFontNameState *>(pSaved);
+	}
+	return bOk;
+}
 
 //////////////////////////////////////////////////////////////////////////////
 
--- abi/src/wp/impexp/xp/ie_imp_RTF.h	2005-06-03 04:10:01.619896392 +0100
+++ abi/src/wp/impexp/xp/ie_imp_RTF.h-NEWEST	2005-06-03 04:09:06.000000000 +0100
@@ -612,6 +612,7 @@
 	bool ReadColourTable();
 	bool ReadFontTable();
 	bool ReadOneFontFromTable(bool bNested);
+	bool ReadFontName(UT_String *sFontName, UT_String *sAltFontName, bool *bFontNameHasHex, bool *bAltFontNameHasHex);
 	bool ReadRevisionTable();
 	void setEncoding();
 public: