diff -uNr abi-old/src/af/gr/xp/gr_Graphics.cpp abi/src/af/gr/xp/gr_Graphics.cpp --- abi-old/src/af/gr/xp/gr_Graphics.cpp 2005-03-05 05:30:33.000000000 +0000 +++ abi/src/af/gr/xp/gr_Graphics.cpp 2005-03-05 08:09:22.000000000 +0000 @@ -1095,23 +1095,77 @@ } /*! - return true if linebreak at character c is permissible - the built-in class is too simple to differentiate between breaks before and after character + * return true if linebreak after/before character c is permissible. + * + * When editing Chinese documents certain characters cannot be allowed + * to start a line (eg. ".,)>>}") and certain others cannot end a line + * (eg. "(<<{"). Also, a few characters like "--" must occur in pairs + * which cannot be split. This function returns true or false in such a + * way as to ensure these rules. The characters affected are defined + * in the table in xap_EncodingManager.cpp */ -bool GR_Graphics::canBreak(GR_RenderInfo & ri, UT_sint32 &iNext, bool /* bAfter */) +bool GR_Graphics::canBreak(GR_RenderInfo & ri, UT_sint32 &iNext, bool bAfter) { + UT_return_val_if_fail(getApp(), false); + const class XAP_EncodingManager *EncMan=getApp()->getEncodingManager(); + struct cjk_prop_ cjkprop[3]; + UT_UCS4Char c[3]; + int i; + + UT_ASSERT(EncMan); + iNext = -1; // we do not bother with this UT_return_val_if_fail(ri.m_pText && ri.m_pText->getStatus() == UTIter_OK, false); *(ri.m_pText) += ri.m_iOffset; UT_return_val_if_fail(ri.m_pText->getStatus() == UTIter_OK, false); - UT_UCS4Char c = ri.m_pText->getChar(); - - UT_return_val_if_fail(getApp(), false); - return getApp()->getEncodingManager()->can_break_at(c); + /* + * For CJK we need to consider the characters either side as well. + */ + c[1] = ri.m_pText->getChar(); + --(*ri.m_pText); + c[0] = ri.m_pText->getChar(); + ++(*ri.m_pText); ++(*ri.m_pText); + c[2] = ri.m_pText->getChar(); + + // Get the CJK properties of the three chars. + for (i=0; i<3; i++) + cjkprop[i] = EncMan->char_cjk_prop(c[i]); + +#if 1 + // This is just for debugging + printf("KAY: Break %s ", bAfter?"after":"before"); + for (i=0; i<3; i++) { + printf("c%d: %x(%s%s%s%s) ", i, c[i], + cjkprop[i].cjk?"C":".", + cjkprop[i].cant_start_line?"S":".", + cjkprop[i].cant_end_line?"E":".", + cjkprop[i].must_not_split?"N":"."); + } + printf("\n"); +#endif + + if (bAfter) // We're being asked about breaking after. + { + if (cjkprop[1].must_not_split && c[1]==c[2]) + return false; + if (cjkprop[1].cant_end_line || !cjkprop[1].cjk) + return EncMan->can_break_at(c[1], bAfter); + return !cjkprop[2].cant_start_line; + } + else // We're being asked about breaking before. + { + if (cjkprop[1].must_not_split && c[1]==c[0]) + return false; + if (cjkprop[1].cant_start_line || !cjkprop[1].cjk) + return EncMan->can_break_at(c[1], bAfter); + return !cjkprop[1].cant_end_line; + } + UT_ASSERT(UT_SHOULD_NOT_HAPPEN); } + /*! resetJustification() makes the data represented by ri unjustified and returns value by which the total width changed as a result such diff -uNr abi-old/src/af/xap/xp/xap_EncodingManager.cpp abi/src/af/xap/xp/xap_EncodingManager.cpp --- abi-old/src/af/xap/xp/xap_EncodingManager.cpp 2005-03-05 05:30:41.000000000 +0000 +++ abi/src/af/xap/xp/xap_EncodingManager.cpp 2005-03-05 08:50:29.000000000 +0000 @@ -646,6 +646,7 @@ }; +#if 0 // This function is made obsolete by rules in gr_Graphics::canBreak() /* TODO I'm pretty sure you can't break Korean at any character. TODO And what about Japanese Katakana and Hiragana? @@ -656,6 +657,109 @@ {"1",cjk_languages}, {NULL} }; +#endif + +/* + * + * A list of characters that cannot appear at the beginning + * of a line. These must be in NUMERICAL ORDER. + * + * This is mainly for CJK languages in which we can break between + * any character. + */ +static const UT_UCS4Char cjk_cant_start_line[]= +{ + 0x0021, // ASCII eclamation mark. + 0x0029, // ASCII right parenthesis + 0x002e, // ASCII full stop. + 0x002c, // ASCII colon. + 0x003b, // ASCII semicolon. + 0x003e, // ASCII greater-than sign. + 0x003f, // ASCII question mark. + 0x2019, // Right single quotation mark. (fullwidth?) + 0x201d, // Right double quotation marks. (fullwidth?) + 0x3001, // Ideographic comma. + 0x3002, // Ideographic full stop. + 0x300b, // Right double angle bracket. + 0xff01, // Fullwidth exclamation mark. + 0xff09, // Fullwidth right parenthesis. + 0xff0c, // Full width comma. + 0xff1a, // Fullwidth colon. + 0xff1b, // Fullwidth semicolon. + 0xfe1e, // Fullwidth greater-than sign. + 0xff1f, // Fullwidth question mark. + 0xff3d, // Fullwidth right square bracket. + 0xff5d, // Fullwidth right curly bracket. + 0x0 +}; + +/* These must also be in NUMERICAL ORDER. */ +static const UT_UCS4Char cjk_cant_end_line[]= +{ + 0x0028, // ASCII left parenthesis. + 0x003c, // ASCII less-than sign. + 0x201c, // Left double quotation mark. + 0x300a, // Left double angle bracket. + 0xff08, // Fullwidth left parenthesis. + 0xff1c, // Fullwifth less-than sign. + 0xff3b, // Fullwidth left square bracket. + 0xff5b, // Fullwidth left curly bracket. + 0x0 +}; + +/* + * A list of characters that, when in pairs, must not be split. + * These must also be in NUMERICAL ORDER. + */ +static const UT_UCS4Char cjk_must_not_split[]= +{ + 0x2014, // EM Dash. + 0x2036 // Horizontal ellipsis. +}; + +/* + * Returns true if "c" is in "list". + */ +static bool is_in_list(UT_UCS4Char c, const UT_UCS4Char *list) +{ + int i=0; + + while ((c >= list[i]) && (list[i] != 0)) { + if (list[i]==c) + return true; + i++; + } + return false; +} + +/* + * Given a characte rreturns a value that indicates if it is a + * CJK character and if so is it forbidden from the beginning or + * end of a line. + */ +struct cjk_prop_ XAP_EncodingManager::char_cjk_prop(UT_UCS4Char c) const +{ + struct cjk_prop_ prop = {false, false, false}; + + // Does this fall into any of the ranges for CJK characters? + if ((c>=0x20000 && c<=0x2a6df) || (c>=0x2f800 && c<=0x2fa1f) || + (c>=0x3000 && c<=0x303f) || (c>=0x3200 && c<=0x32ff) || + (c>=0x3300 && c<=0x33ff) || (c>=0x3400 && c<=0x4dbf) || + (c>=0x4e00 && c<=0x9faf) || (c>=0xf900 && c<=0xfaff) || + (c>=0xfe30 && c<=0xfe4f)) + prop.cjk = true; + + if (is_in_list(c, cjk_cant_start_line)) + prop.cant_start_line = true; + + if (is_in_list(c, cjk_cant_end_line)) + prop.cant_end_line = true; + + if (is_in_list(c, cjk_must_not_split)) + prop.must_not_split = true; + + return prop; +} /* This table is useful since some iconv implementations don't know some cpNNNN @@ -1111,8 +1215,11 @@ { const char* str = search_rmap_with_opt_suffix(langcode_to_cjk,SEARCH_PARAMS); is_cjk_ = *str == '1'; - str = search_rmap_with_opt_suffix(can_break_words_data,SEARCH_PARAMS); - can_break_words_ = *str == '1'; +// This is made obsolete by new CJK line break handling in +// gr_Graphics::canBreak. +// +// str = search_rmap_with_opt_suffix(can_break_words_data,SEARCH_PARAMS); +// can_break_words_ = *str == '1'; } { if (cjk_locale()) { @@ -1160,10 +1267,13 @@ int XAP_EncodingManager__swap_stou,XAP_EncodingManager__swap_utos; + +#if 0 // This is made obsolete by new CJK line breaking in gr_Graphics::canBreak bool XAP_EncodingManager::can_break_words() const { return can_break_words_; } +#endif /* I'm not sure whether any non-cjk language doesn't make distinction @@ -1172,38 +1282,101 @@ */ bool XAP_EncodingManager::single_case() const { return cjk_locale(); } +#if 0 // This function is now performed by XAP_EncodingManager::char_cjk_prop bool XAP_EncodingManager::is_cjk_letter(UT_UCSChar c) const { if (!cjk_locale()) return 0; return (c>0xff); } +#endif bool XAP_EncodingManager::noncjk_letters(const UT_UCSChar* str,int len) const { if (!cjk_locale()) return 1; for(int i=0;i + 0x003f, // ? + 0x005d, // ] + 0x007d, // } + 0x2010, // UCS_HYPHEN + 0x2013, // UCS_EN_DASH + 0x2014, // UCS_EM_DASH + 0x3002, // Ideographic full stop + 0x300b, // Right double angle bracket + 0xff01, // Fullwidth exclamation mark + 0xff09, // Fullwidth right parenthesis + 0xff0c, // Fullwidth comma + 0xff0d, // Fullwidth hypen-minus + 0xff1a, // Fullwidth colon + 0xff1b, // Fullwidth semi-colon + 0xff1f, // Fullwidth question mark + 0xff3d, // Fullwidth right square bracket + 0xff5d, // Fullwidth right curly bracket + 0x0 +}; + +static UT_UCSChar can_break_before[] = +{ + 0x0028, // ( + 0x003c, // < + 0x005b, // [ + 0x007b, // { + 0x300a, // Left double angle bracket + 0xff08, // Fullwidth left parenthesis + 0xff3b, // Fullwidth left square bracket + 0xff5b, // Fullwidth left curly bracket + 0x0 +}; + /* - This one correlates with can_break_words() very tightly. - Under CJK locales it returns 1 for cjk letters. - Under non-CJK locales returns 0. + * It appears that without the benefit of spaces we have to be much more + * consistent about whether a character can be broken before or after. So using + * the above tables we deal with punctuation more accurately. Eg, you can + * break after a comma but not before. */ -bool XAP_EncodingManager::can_break_at(const UT_UCSChar c) const +bool XAP_EncodingManager::can_break_at(const UT_UCSChar c, bool bAfter) const { - if (c == UCS_SPACE + if (bAfter) + { + if (is_in_list(c, can_break_after)) + return true; + } + else + { + if (is_in_list(c, can_break_before)) + return true; + } + return false; + +/* if (c == UCS_SPACE || c == UCS_MINUS || c == UCS_HYPHEN || c == UCS_EN_DASH || c == UCS_EM_DASH) return 1; - return is_cjk_letter(c); + + return 0; */ +// return is_cjk_letter(c); // CJK line breaking is now handled in +// GR_Graphics::canBreak(). } @@ -1267,7 +1440,10 @@ "--->8--------------\n" " WinLanguageCode is 0x%04x, WinCharsetCode is %d\n" - " cjk_locale %d, can_break_words %d, swap_utos %d, swap_stou %d\n", + " cjk_locale %d, " + // "can_break_words %d, " Made obsolete by new CJK line breaking in + // gr_Graphics:canBreak + "swap_utos %d, swap_stou %d\n", getLanguageISOName(), getLanguageISOTerritory() ? getLanguageISOTerritory() : "NULL", getNativeEncodingName(),getNativeSystemEncodingName(), getNative8BitEncodingName(),getNativeNonUnicodeEncodingName(), @@ -1275,7 +1451,7 @@ fallbackChar(1072), getTexPrologue(), getWinLanguageCode(), getWinCharsetCode(), - int(cjk_locale()), int(can_break_words()),int(swap_utos),int(swap_stou) + int(cjk_locale()), /*int(can_break_words()),*/ int(swap_utos),int(swap_stou) )); UT_ASSERT( UT_iconv_isValid(iconv_handle_N2U) && UT_iconv_isValid(iconv_handle_U2N) ); } diff -uNr abi-old/src/af/xap/xp/xap_EncodingManager.h abi/src/af/xap/xp/xap_EncodingManager.h --- abi-old/src/af/xap/xp/xap_EncodingManager.h 2005-03-05 05:30:41.000000000 +0000 +++ abi/src/af/xap/xp/xap_EncodingManager.h 2005-03-05 08:08:03.000000000 +0000 @@ -38,6 +38,17 @@ #include "ut_iconv.h" #include "ut_xml.h" +/* + * CJK character properties struct. + */ +struct cjk_prop_ { + bool cjk; + bool cant_end_line; + bool cant_start_line; + bool must_not_split; +}; + + struct ABI_EXPORT XAP_LangInfo { /*no memeber can have NULL value. If string is empty, then value is @@ -169,7 +180,7 @@ /* whether words can be broken at any character of the word (wide character, not byte). True for japanese. */ - virtual bool can_break_words() const; +// virtual bool can_break_words() const; /* returns true if there is no distinction between upper and lower @@ -189,12 +200,18 @@ Under CJK locales it returns 1 for cjk letters. Under non-CJK locales returns 0. */ - virtual bool can_break_at(const UT_UCSChar c) const; + virtual bool can_break_at(const UT_UCSChar c, bool bAfter) const; /* This should be as precise as possible. */ - virtual bool is_cjk_letter(UT_UCSChar c) const; +// virtual bool is_cjk_letter(UT_UCSChar c) const; + + /* + * Returns a value that indicates if c is a CJK character and, if so, + * what it's line breaking properties are. + */ + virtual struct cjk_prop_ char_cjk_prop(UT_UCS4Char c) const; /* This is rather smart wrapper for wvLIDToCodePageConverter. diff -uNr abi-old/src/text/fmt/xp/fp_TextRun.cpp abi/src/text/fmt/xp/fp_TextRun.cpp --- abi-old/src/text/fmt/xp/fp_TextRun.cpp 2005-03-05 05:30:49.000000000 +0000 +++ abi/src/text/fmt/xp/fp_TextRun.cpp 2005-03-05 06:23:11.000000000 +0000 @@ -355,7 +355,7 @@ PD_StruxIterator text(getBlock()->getStruxDocHandle(), getBlockOffset() + fl_BLOCK_STRUX_OFFSET); UT_return_val_if_fail(text.getStatus() == UTIter_OK, false); - text.setUpperLimit(text.getPosition() + getLength() - 1); + //text.setUpperLimit(text.getPosition() + getLength() - 1); UT_return_val_if_fail(m_pRenderInfo, false); m_pRenderInfo->m_pText = &text; @@ -389,7 +389,7 @@ getBlockOffset() + fl_BLOCK_STRUX_OFFSET ); UT_return_val_if_fail(text.getStatus() == UTIter_OK, false); - text.setUpperLimit(text.getPosition() + getLength() - 1); +// text.setUpperLimit(text.getPosition() + getLength() - 1); UT_return_val_if_fail(m_pRenderInfo, false); m_pRenderInfo->m_pText = &text; @@ -532,7 +532,7 @@ offset + fl_BLOCK_STRUX_OFFSET); m_pRenderInfo->m_pText = &text; - text.setUpperLimit(text.getPosition() + getLength() - 1); +// text.setUpperLimit(text.getPosition() + getLength() - 1); UT_uint32 iPosStart = text.getPosition(); //bool bReverse = (getVisDirection() == UT_BIDI_RTL); @@ -564,8 +564,8 @@ text.setPosition(iPos); } - if (bForce || iNext == (UT_sint32)i || bCanBreak) - // && ((i + offset) != (getBlockOffset() + getLength() - 1)) + if (bForce || iNext == (UT_sint32)i || bCanBreak + && ((i + offset) != (getBlockOffset() + getLength() - 1)))// KAY: Enabled { UT_sint32 ispace = 0; if(c == UCS_SPACE) diff -uNr abi-old/src/text/fmt/xp/fv_View.cpp abi/src/text/fmt/xp/fv_View.cpp --- abi-old/src/text/fmt/xp/fv_View.cpp 2005-03-05 05:30:49.000000000 +0000 +++ abi/src/text/fmt/xp/fv_View.cpp 2005-03-05 06:23:11.000000000 +0000 @@ -9856,8 +9856,10 @@ // CJK-FIXME: this can work incorrectly under CJK locales // since it can give 'true' for UCS with value >0xff (like // quotes, etc). + // New function might be better. - R.Kay if (newWord || - XAP_EncodingManager::get_instance()->is_cjk_letter(pSpan[i])) +// XAP_EncodingManager::get_instance()->is_cjk_letter(pSpan[i])) + (XAP_EncodingManager::get_instance()->char_cjk_prop(pSpan[i]).cjk)) { wCount.word++;