/*
* @(#)BreakIterator.java 1.35 03/12/19
*
* Copyright 2004 Sun Microsystems, Inc. All rights reserved.
* SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
*/
/*
* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
* (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
*
* The original version of this source code and documentation
* is copyrighted and owned by Taligent, Inc., a wholly-owned
* subsidiary of IBM. These materials are provided under terms
* of a License Agreement between Taligent and Sun. This technology
* is protected by multiple US and International patents.
*
* This notice and attribution to Taligent may not be removed.
* Taligent is a registered trademark of Taligent, Inc.
*
*/
package java.text;
import java.util.Vector;
import java.util.Locale;
import java.util.ResourceBundle;
import java.util.MissingResourceException;
import sun.text.resources.LocaleData;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.net.URL;
import java.io.InputStream;
import java.io.IOException;
import java.lang.ref.SoftReference;
import java.security.AccessController;
import java.security.PrivilegedAction;
/**
* The BreakIterator
class implements methods for finding
* the location of boundaries in text. Instances of BreakIterator
* maintain a current position and scan over text
* returning the index of characters where boundaries occur.
* Internally, BreakIterator
scans text using a
* CharacterIterator
, and is thus able to scan text held
* by any object implementing that protocol. A StringCharacterIterator
* is used to scan String
objects passed to setText
.
*
*
* You use the factory methods provided by this class to create
* instances of various types of break iterators. In particular,
* use getWordIterator
, getLineIterator
,
* getSentenceIterator
, and getCharacterIterator
* to create BreakIterator
s that perform
* word, line, sentence, and character boundary analysis respectively.
* A single BreakIterator
can work only on one unit
* (word, line, sentence, and so on). You must use a different iterator
* for each unit boundary analysis you wish to perform.
*
*
* Line boundary analysis determines where a text string can be * broken when line-wrapping. The mechanism correctly handles * punctuation and hyphenated words. * *
* Sentence boundary analysis allows selection with correct interpretation * of periods within numbers and abbreviations, and trailing punctuation * marks such as quotation marks and parentheses. * *
* Word boundary analysis is used by search and replace functions, as * well as within text editing applications that allow the user to * select words with a double click. Word selection provides correct * interpretation of punctuation marks within and following * words. Characters that are not part of a word, such as symbols * or punctuation marks, have word-breaks on both sides. * *
* Character boundary analysis allows users to interact with characters * as they expect to, for example, when moving the cursor through a text * string. Character boundary analysis provides correct navigation of * through character strings, regardless of how the character is stored. * For example, an accented character might be stored as a base character * and a diacritical mark. What users consider to be a character can * differ between languages. * *
* BreakIterator
is intended for use with natural
* languages only. Do not use this class to tokenize a programming language.
*
*
* Examples:
* Creating and using text boundaries *
** * Print each element in order ** public static void main(String args[]) { * if (args.length == 1) { * String stringToExamine = args[0]; * //print each word in order * BreakIterator boundary = BreakIterator.getWordInstance(); * boundary.setText(stringToExamine); * printEachForward(boundary, stringToExamine); * //print each sentence in reverse order * boundary = BreakIterator.getSentenceInstance(Locale.US); * boundary.setText(stringToExamine); * printEachBackward(boundary, stringToExamine); * printFirst(boundary, stringToExamine); * printLast(boundary, stringToExamine); * } * } **
** * Print each element in reverse order ** public static void printEachForward(BreakIterator boundary, String source) { * int start = boundary.first(); * for (int end = boundary.next(); * end != BreakIterator.DONE; * start = end, end = boundary.next()) { * System.out.println(source.substring(start,end)); * } * } **
** * Print first element ** public static void printEachBackward(BreakIterator boundary, String source) { * int end = boundary.last(); * for (int start = boundary.previous(); * start != BreakIterator.DONE; * end = start, start = boundary.previous()) { * System.out.println(source.substring(start,end)); * } * } **
** * Print last element ** public static void printFirst(BreakIterator boundary, String source) { * int start = boundary.first(); * int end = boundary.next(); * System.out.println(source.substring(start,end)); * } **
** * Print the element at a specified position ** public static void printLast(BreakIterator boundary, String source) { * int end = boundary.last(); * int start = boundary.previous(); * System.out.println(source.substring(start,end)); * } **
** * Find the next word ** public static void printAt(BreakIterator boundary, int pos, String source) { * int end = boundary.following(pos); * int start = boundary.previous(); * System.out.println(source.substring(start,end)); * } **
** * @see CharacterIterator * */ public abstract class BreakIterator implements Cloneable { /** * Constructor. BreakIterator is stateless and has no default behavior. */ protected BreakIterator() { } /** * Create a copy of this iterator * @return A copy of this */ public Object clone() { try { return super.clone(); } catch (CloneNotSupportedException e) { throw new InternalError(); } } /** * DONE is returned by previous() and next() after all valid * boundaries have been returned. */ public static final int DONE = -1; /** * Return the first boundary. The iterator's current position is set * to the first boundary. * @return The character index of the first text boundary. */ public abstract int first(); /** * Return the last boundary. The iterator's current position is set * to the last boundary. * @return The character index of the last text boundary. */ public abstract int last(); /** * Return the nth boundary from the current boundary * @param n which boundary to return. A value of 0 * does nothing. Negative values move to previous boundaries * and positive values move to later boundaries. * @return The index of the nth boundary from the current position. */ public abstract int next(int n); /** * Return the boundary following the current boundary. * @return The character index of the next text boundary or DONE if all * boundaries have been returned. Equivalent to next(1). */ public abstract int next(); /** * Return the boundary preceding the current boundary. * @return The character index of the previous text boundary or DONE if all * boundaries have been returned. */ public abstract int previous(); /** * Return the first boundary following the specified offset. * The value returned is always greater than the offset or * the value BreakIterator.DONE * @param offset the offset to begin scanning. Valid values * are determined by the CharacterIterator passed to * setText(). Invalid values cause * an IllegalArgumentException to be thrown. 
* @return The first boundary after the specified offset. */ public abstract int following(int offset); /** * Return the last boundary preceding the specfied offset. * The value returned is always less than the offset or the value * BreakIterator.DONE. * @param offset the offset to begin scanning. Valid values are * determined by the CharacterIterator passed to setText(). * Invalid values cause an IllegalArgumentException to be thrown. * @return The last boundary before the specified offset. * @since 1.2 */ public int preceding(int offset) { // NOTE: This implementation is here solely because we can't add new // abstract methods to an existing class. There is almost ALWAYS a // better, faster way to do this. int pos = following(offset); while (pos >= offset && pos != DONE) pos = previous(); return pos; } /** * Return true if the specified position is a boundary position. * @param offset the offset to check. * @return True if "offset" is a boundary position. * @since 1.2 */ public boolean isBoundary(int offset) { // NOTE: This implementation probably is wrong for most situations // because it fails to take into account the possibility that a // CharacterIterator passed to setText() may not have a begin offset // of 0. But since the abstract BreakIterator doesn't have that // knowledge, it assumes the begin offset is 0. If you subclass // BreakIterator, copy the SimpleTextBoundary implementation of this // function into your subclass. [This should have been abstract at // this level, but it's too late to fix that now.] if (offset == 0) return true; else return following(offset - 1) == offset; } /** * Return character index of the text boundary that was most recently * returned by next(), previous(), first(), or last() * @return The boundary most recently returned. */ public abstract int current(); /** * Get the text being scanned * @return the text being scanned */ public abstract CharacterIterator getText(); /** * Set a new text string to be scanned. 
The current scan * position is reset to first(). * @param newText new text to scan. */ public void setText(String newText) { setText(new StringCharacterIterator(newText)); } /** * Set a new text for scanning. The current scan * position is reset to first(). * @param newText new text to scan. */ public abstract void setText(CharacterIterator newText); private static final int CHARACTER_INDEX = 0; private static final int WORD_INDEX = 1; private static final int LINE_INDEX = 2; private static final int SENTENCE_INDEX = 3; private static final SoftReference[] iterCache = new SoftReference[4]; /** * Create BreakIterator for word-breaks using default locale. * Returns an instance of a BreakIterator implementing word breaks. * WordBreak is usefull for word selection (ex. double click) * @return A BreakIterator for word-breaks * @see java.util.Locale#getDefault */ public static BreakIterator getWordInstance() { return getWordInstance(Locale.getDefault()); } /** * Create BreakIterator for word-breaks using specified locale. * Returns an instance of a BreakIterator implementing word breaks. * WordBreak is usefull for word selection (ex. double click) * @param where the local. If a specific WordBreak is not * avaliable for the specified locale, a default WordBreak is returned. * @return A BreakIterator for word-breaks */ public static BreakIterator getWordInstance(Locale where) { return getBreakInstance(where, WORD_INDEX, "WordData", "WordDictionary"); } /** * Create BreakIterator for line-breaks using default locale. * Returns an instance of a BreakIterator implementing line breaks. Line * breaks are logically possible line breaks, actual line breaks are * usually determined based on display width. * LineBreak is useful for word wrapping text. 
* @return A BreakIterator for line-breaks * @see java.util.Locale#getDefault */ public static BreakIterator getLineInstance() { return getLineInstance(Locale.getDefault()); } /** * Create BreakIterator for line-breaks using specified locale. * Returns an instance of a BreakIterator implementing line breaks. Line * breaks are logically possible line breaks, actual line breaks are * usually determined based on display width. * LineBreak is useful for word wrapping text. * @param where the local. If a specific LineBreak is not * avaliable for the specified locale, a default LineBreak is returned. * @return A BreakIterator for line-breaks */ public static BreakIterator getLineInstance(Locale where) { return getBreakInstance(where, LINE_INDEX, "LineData", "LineDictionary"); } /** * Create BreakIterator for character-breaks using default locale * Returns an instance of a BreakIterator implementing character breaks. * Character breaks are boundaries of combining character sequences. * @return A BreakIterator for character-breaks * @see Locale#getDefault */ public static BreakIterator getCharacterInstance() { return getCharacterInstance(Locale.getDefault()); } /** * Create BreakIterator for character-breaks using specified locale * Returns an instance of a BreakIterator implementing character breaks. * Character breaks are boundaries of combining character sequences. * @param where the local. If a specific character break is not * avaliable for the specified local, a default character break is returned. * @return A BreakIterator for character-breaks */ public static BreakIterator getCharacterInstance(Locale where) { return getBreakInstance(where, CHARACTER_INDEX, "CharacterData", "CharacterDictionary"); } /** * Create BreakIterator for sentence-breaks using default locale * Returns an instance of a BreakIterator implementing sentence breaks. 
* @return A BreakIterator for sentence-breaks * @see java.util.Locale#getDefault */ public static BreakIterator getSentenceInstance() { return getSentenceInstance(Locale.getDefault()); } /** * Create BreakIterator for sentence-breaks using specified locale * Returns an instance of a BreakIterator implementing sentence breaks. * @param where the local. If a specific SentenceBreak is not * avaliable for the specified local, a default SentenceBreak is returned. * @return A BreakIterator for sentence-breaks */ public static BreakIterator getSentenceInstance(Locale where) { return getBreakInstance(where, SENTENCE_INDEX, "SentenceData", "SentenceDictionary"); } private static BreakIterator getBreakInstance(Locale where, int type, String dataName, String dictionaryName) { if (iterCache[type] != null) { BreakIteratorCache cache = (BreakIteratorCache) iterCache[type].get(); if (cache != null) { if (cache.getLocale().equals(where)) { return cache.createBreakInstance(); } } } BreakIterator result = createBreakInstance(where, type, dataName, dictionaryName); BreakIteratorCache cache = new BreakIteratorCache(where, result); iterCache[type] = new SoftReference(cache); return result; } private static ResourceBundle getBundle(final String baseName, final Locale locale) { return (ResourceBundle) AccessController.doPrivileged(new PrivilegedAction() { public Object run() { return ResourceBundle.getBundle(baseName, locale); } }); } private static BreakIterator createBreakInstance(Locale where, int type, String dataName, String dictionaryName) { ResourceBundle bundle = getBundle( "sun.text.resources.BreakIteratorInfo", where); String[] classNames = bundle.getStringArray("BreakIteratorClasses"); String dataFile = bundle.getString(dataName); try { if (classNames[type].equals("RuleBasedBreakIterator")) { return new RuleBasedBreakIterator(dataFile); } else if (classNames[type].equals("DictionaryBasedBreakIterator")) { String dictionaryFile = bundle.getString(dictionaryName); return new 
DictionaryBasedBreakIterator(dataFile, dictionaryFile); } else { throw new IllegalArgumentException("Invalid break iterator class \"" + classNames[type] + "\""); } } catch (Exception e) { throw new InternalError(e.toString()); } } /** * Returns an array of all locales for which the ** public static int nextWordStartAfter(int pos, String text) { * BreakIterator wb = BreakIterator.getWordInstance(); * wb.setText(text); * int last = wb.following(pos); * int current = wb.next(); * while (current != BreakIterator.DONE) { * for (int p = last; p < current; p++) { * if (Character.isLetter(text.codePointAt(p)) * return last; * } * last = current; * current = wb.next(); * } * return BreakIterator.DONE; * } ** (The iterator returned by BreakIterator.getWordInstance() is unique in that * the break positions it returns don't represent both the start and end of the * thing being iterated over. That is, a sentence-break iterator returns breaks * that each represent the end of one sentence and the beginning of the next. * With the word-break iterator, the characters between two boundaries might be a * word, or they might be the punctuation or whitespace between two words. The * above code uses a simple heuristic to determine which boundary is the beginning * of a word: If the characters between this boundary and the next boundary * include at least one letter (this can be an alphabetical letter, a CJK ideograph, * a Hangul syllable, a Kana character, etc.), then the text between this boundary * and the next is a word; otherwise, it's the material between words.) *
get*Instance
methods of this class can return
* localized instances.
* The array returned must contain at least a Locale
* instance equal to {@link java.util.Locale#US Locale.US}.
*
* @return An array of locales for which localized
* BreakIterator
instances are available.
*/
public static synchronized Locale[] getAvailableLocales()
{
//FIX ME - this is a known bug. It should return
//all locales.
return LocaleData.getAvailableLocales("NumberPatterns");
}
/**
 * Pairs a locale with a private prototype copy of a break iterator.
 * The iterator handed to the constructor is cloned on the way in, and
 * every request is answered with a fresh clone of that prototype, so the
 * stored iterator is never handed out directly.
 */
private static final class BreakIteratorCache {

    /** Private copy of the iterator; only clones of it are handed out. */
    private final BreakIterator prototype;

    /** Locale the prototype was created for. */
    private final Locale locale;

    /**
     * @param where the locale the iterator was created for
     * @param iter the iterator to remember; a clone of it is stored
     */
    BreakIteratorCache(Locale where, BreakIterator iter) {
        locale = where;
        prototype = (BreakIterator) iter.clone();
    }

    /**
     * @return the locale passed to the constructor
     */
    Locale getLocale() {
        return locale;
    }

    /**
     * @return a new clone of the stored prototype iterator
     */
    BreakIterator createBreakInstance() {
        Object copy = prototype.clone();
        return (BreakIterator) copy;
    }
}
/**
 * Reads eight consecutive bytes from <code>buf</code>, starting at
 * <code>offset</code>, and combines them into a <code>long</code> with
 * the byte at <code>offset</code> as the most significant byte.
 *
 * @param buf the byte array to read from
 * @param offset index of the first (most significant) byte
 * @return the assembled 64-bit value
 */
protected static long getLong(byte[] buf, int offset) {
    long value = 0L;
    int pos = offset;
    // Each pass shifts the accumulated bytes up and appends the next one;
    // the mask strips the sign extension of the byte.
    for (int remaining = 8; remaining > 0; remaining--) {
        value = (value << 8) | (buf[pos++] & 0xFF);
    }
    return value;
}
/**
 * Reads four consecutive bytes from <code>buf</code>, starting at
 * <code>offset</code>, and combines them into an <code>int</code> with
 * the byte at <code>offset</code> as the most significant byte.
 *
 * @param buf the byte array to read from
 * @param offset index of the first (most significant) byte
 * @return the assembled 32-bit value
 */
protected static int getInt(byte[] buf, int offset) {
    // Operands are evaluated left to right, so the bytes are read in
    // ascending index order; each mask strips the byte's sign extension.
    return ((buf[offset] & 0xFF) << 24)
            | ((buf[offset + 1] & 0xFF) << 16)
            | ((buf[offset + 2] & 0xFF) << 8)
            | (buf[offset + 3] & 0xFF);
}
/**
 * Reads two consecutive bytes from <code>buf</code>, starting at
 * <code>offset</code>, and combines them into a <code>short</code> with
 * the byte at <code>offset</code> as the high-order byte.
 *
 * @param buf the byte array to read from
 * @param offset index of the first (high-order) byte
 * @return the assembled 16-bit value
 */
protected static short getShort(byte[] buf, int offset) {
    int high = buf[offset] & 0xFF;
    int low = buf[offset + 1] & 0xFF;
    return (short) ((high << 8) | low);
}
}