001/* 002 * Copyright (C) 2008 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.base; 016 017import static com.google.common.base.Preconditions.checkArgument; 018import static com.google.common.base.Preconditions.checkNotNull; 019import static com.google.common.base.Preconditions.checkPositionIndex; 020 021import com.google.common.annotations.GwtCompatible; 022import com.google.common.annotations.GwtIncompatible; 023import com.google.common.annotations.VisibleForTesting; 024import java.util.Arrays; 025import java.util.BitSet; 026 027/** 028 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does 029 * for any {@link Object}. Also offers basic text processing methods based on this function. 030 * Implementations are strongly encouraged to be side-effect-free and immutable. 031 * 032 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean 033 * "any {@code char} value {@code c} for which {@code this.matches(c)} returns {@code true}". 034 * 035 * <p><b>Warning:</b> This class deals only with {@code char} values, that is, <a 036 * href="http://www.unicode.org/glossary/#BMP_character">BMP characters</a>. 
It does not understand 037 * <a href="http://www.unicode.org/glossary/#supplementary_code_point">supplementary Unicode code 038 * points</a> in the range {@code 0x10000} to {@code 0x10FFFF} which includes the majority of 039 * assigned characters, including important CJK characters and emoji. 040 * 041 * <p>Supplementary characters are <a 042 * href="https://docs.oracle.com/javase/8/docs/api/java/lang/Character.html#supplementary">encoded 043 * into a {@code String} using surrogate pairs</a>, and a {@code CharMatcher} treats these just as 044 * two separate characters. {@link #countIn} counts each supplementary character as 2 {@code char}s. 045 * 046 * <p>For up-to-date Unicode character properties (digit, letter, etc.) and support for 047 * supplementary code points, use ICU4J UCharacter and UnicodeSet (freeze() after building). For 048 * basic text processing based on UnicodeSet use the ICU4J UnicodeSetSpanner. 049 * 050 * <p>Example usages: 051 * 052 * <pre> 053 * String trimmed = {@link #whitespace() whitespace()}.{@link #trimFrom trimFrom}(userInput); 054 * if ({@link #ascii() ascii()}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre> 055 * 056 * <p>See the Guava User Guide article on <a 057 * href="https://github.com/google/guava/wiki/StringsExplained#charmatcher">{@code CharMatcher} 058 * </a>. 
*
 * @author Kevin Bourrillion
 * @since 1.0
 */
@GwtCompatible(emulated = true)
public abstract class CharMatcher implements Predicate<Character> {
  /*
   * N777777777NO
   * N7777777777777N
   * M777777777777777N
   * $N877777777D77777M
   * N M77777777ONND777M
   * MN777777777NN D777
   * N7ZN777777777NN ~M7778
   * N777777777777MMNN88777N
   * N777777777777MNZZZ7777O
   * DZN7777O77777777777777
   * N7OONND7777777D77777N
   * 8$M++++?N???$77777$
   * M7++++N+M77777777N
   * N77O777777777777$ M
   * DNNM$$$$777777N D
   * N$N:=N$777N7777M NZ
   * 77Z::::N777777777 ODZZZ
   * 77N::::::N77777777M NNZZZ$
   * $777:::::::77777777MN ZM8ZZZZZ
   * 777M::::::Z7777777Z77 N++ZZZZNN
   * 7777M:::::M7777777$777M $++IZZZZM
   * M777$:::::N777777$M7777M +++++ZZZDN
   * NN$::::::7777$$M777777N N+++ZZZZNZ
   * N::::::N:7$O:77777777 N++++ZZZZN
   * M::::::::::::N77777777+ +?+++++ZZZM
   * 8::::::::::::D77777777M O+++++ZZ
   * ::::::::::::M777777777N O+?D
   * M:::::::::::M77777777778 77=
   * D=::::::::::N7777777777N 777
   * INN===::::::=77777777777N I777N
   * ?777N========N7777777777787M N7777
   * 77777$D======N77777777777N777N? N777777
   * I77777$$$N7===M$$77777777$77777777$MMZ77777777N
   * $$$$$$$$$$$NIZN$$$$$$$$$M$$7777777777777777ON
   * M$$$$$$$$M M$$$$$$$$N=N$$$$7777777$$$ND
   * O77Z$$$$$$$ M$$$$$$$$MNI==$DNNNNM=~N
   * 7 :N MNN$$$$M$ $$$777$8 8D8I
   * NMM.:7O 777777778
   * 7777777MN
   * M NO .7:
   * M : M
   * 8
   */

  // Constant matcher factory methods

  /**
   * Matches any character.
   *
   * @since 19.0 (since 1.0 as constant {@code ANY})
   */
  public static CharMatcher any() {
    // Returns the pre-built instance held by Any instead of allocating a matcher per call.
    return Any.INSTANCE;
  }

  /**
   * Matches no characters.
*
   * @since 19.0 (since 1.0 as constant {@code NONE})
   */
  public static CharMatcher none() {
    // Like the other constant factories below, this returns a pre-built static instance rather
    // than allocating a matcher per call.
    return None.INSTANCE;
  }

  /**
   * Determines whether a character is whitespace according to the latest Unicode standard, as
   * illustrated <a
   * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
   * This is not the same definition used by other Java APIs. (See a <a
   * href="https://goo.gl/Y6SLWx">comparison of several definitions of "whitespace"</a>.)
   *
   * <p>All Unicode White_Space characters are on the BMP and thus supported by this API.
   *
   * <p><b>Note:</b> as the Unicode definition evolves, we will modify this matcher to keep it up to
   * date.
   *
   * @since 19.0 (since 1.0 as constant {@code WHITESPACE})
   */
  public static CharMatcher whitespace() {
    return Whitespace.INSTANCE;
  }

  /**
   * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
   * interpreted as a break between words for formatting purposes). See {@link #whitespace()} for a
   * discussion of that term.
   *
   * @since 19.0 (since 2.0 as constant {@code BREAKING_WHITESPACE})
   */
  public static CharMatcher breakingWhitespace() {
    return BreakingWhitespace.INSTANCE;
  }

  /**
   * Determines whether a character is ASCII, meaning that its code point is less than 128.
   *
   * @since 19.0 (since 1.0 as constant {@code ASCII})
   */
  public static CharMatcher ascii() {
    return Ascii.INSTANCE;
  }

  /**
   * Determines whether a character is a BMP digit according to <a
   * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. If
   * you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
   *
   * @deprecated Many digits are supplementary characters; see the class documentation.
* @since 19.0 (since 1.0 as constant {@code DIGIT})
   */
  @Deprecated
  public static CharMatcher digit() {
    // Like the other constant factories below, this returns a pre-built static instance rather
    // than allocating a matcher per call.
    return Digit.INSTANCE;
  }

  /**
   * Determines whether a character is a BMP digit according to {@linkplain Character#isDigit(char)
   * Java's definition}. If you only care to match ASCII digits, you can use {@code inRange('0',
   * '9')}.
   *
   * @deprecated Many digits are supplementary characters; see the class documentation.
   * @since 19.0 (since 1.0 as constant {@code JAVA_DIGIT})
   */
  @Deprecated
  public static CharMatcher javaDigit() {
    return JavaDigit.INSTANCE;
  }

  /**
   * Determines whether a character is a BMP letter according to {@linkplain
   * Character#isLetter(char) Java's definition}. If you only care to match letters of the Latin
   * alphabet, you can use {@code inRange('a', 'z').or(inRange('A', 'Z'))}.
   *
   * @deprecated Most letters are supplementary characters; see the class documentation.
   * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER})
   */
  @Deprecated
  public static CharMatcher javaLetter() {
    return JavaLetter.INSTANCE;
  }

  /**
   * Determines whether a character is a BMP letter or digit according to {@linkplain
   * Character#isLetterOrDigit(char) Java's definition}.
   *
   * @deprecated Most letters and digits are supplementary characters; see the class documentation.
   * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER_OR_DIGIT})
   */
  @Deprecated
  public static CharMatcher javaLetterOrDigit() {
    return JavaLetterOrDigit.INSTANCE;
  }

  /**
   * Determines whether a BMP character is upper case according to {@linkplain
   * Character#isUpperCase(char) Java's definition}.
   *
   * @deprecated Some uppercase characters are supplementary characters; see the class
   *     documentation.
* @since 19.0 (since 1.0 as constant {@code JAVA_UPPER_CASE})
   */
  @Deprecated
  public static CharMatcher javaUpperCase() {
    // Like the other constant factories below, this returns a pre-built static instance rather
    // than allocating a matcher per call.
    return JavaUpperCase.INSTANCE;
  }

  /**
   * Determines whether a BMP character is lower case according to {@linkplain
   * Character#isLowerCase(char) Java's definition}.
   *
   * @deprecated Some lowercase characters are supplementary characters; see the class
   *     documentation.
   * @since 19.0 (since 1.0 as constant {@code JAVA_LOWER_CASE})
   */
  @Deprecated
  public static CharMatcher javaLowerCase() {
    return JavaLowerCase.INSTANCE;
  }

  /**
   * Determines whether a character is an ISO control character as specified by {@link
   * Character#isISOControl(char)}.
   *
   * <p>All ISO control codes are on the BMP and thus supported by this API.
   *
   * @since 19.0 (since 1.0 as constant {@code JAVA_ISO_CONTROL})
   */
  public static CharMatcher javaIsoControl() {
    return JavaIsoControl.INSTANCE;
  }

  /**
   * Determines whether a character is invisible; that is, if its Unicode category is any of
   * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
   * PRIVATE_USE according to ICU4J.
   *
   * <p>See also the Unicode Default_Ignorable_Code_Point property (available via ICU).
   *
   * @deprecated Most invisible characters are supplementary characters; see the class
   *     documentation.
   * @since 19.0 (since 1.0 as constant {@code INVISIBLE})
   */
  @Deprecated
  public static CharMatcher invisible() {
    return Invisible.INSTANCE;
  }

  /**
   * Determines whether a character is single-width (not double-width). When in doubt, this matcher
   * errs on the side of returning {@code false} (that is, it tends to assume a character is
   * double-width).
   *
   * <p><b>Note:</b> as the reference file evolves, we will modify this matcher to keep it up to
   * date.
*
   * <p>See also <a href="http://www.unicode.org/reports/tr11/">UAX #11 East Asian Width</a>.
   *
   * @deprecated Many such characters are supplementary characters; see the class documentation.
   * @since 19.0 (since 1.0 as constant {@code SINGLE_WIDTH})
   */
  @Deprecated
  public static CharMatcher singleWidth() {
    return SingleWidth.INSTANCE;
  }

  // Legacy constants
  //
  // Each legacy constant is initialized by calling its replacement factory method, so it holds
  // the object that factory returned when this class was initialized.

  /**
   * Determines whether a character is whitespace according to the latest Unicode
   * standard, as illustrated
   * <a
   * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
   * This is not the same definition used by other Java APIs. (See a
   * <a href="https://goo.gl/Y6SLWx">comparison of several definitions of
   * "whitespace"</a>.)
   *
   * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant
   * to keep it up to date.
   *
   * @deprecated Use {@link #whitespace()} instead. This constant is scheduled to be
   *     removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher WHITESPACE = whitespace();

  /**
   * Determines whether a character is a breaking whitespace (that is, a whitespace
   * which can be interpreted as a break between words for formatting purposes). See
   * {@link #whitespace} for a discussion of that term.
   *
   * @since 2.0
   * @deprecated Use {@link #breakingWhitespace()} instead. This constant is scheduled
   *     to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher BREAKING_WHITESPACE = breakingWhitespace();

  /**
   * Determines whether a character is ASCII, meaning that its code point is less than
   * 128.
   *
   * @deprecated Use {@link #ascii()} instead. This constant is scheduled to be
   *     removed in June 2018.
*/
  @com.google.common.annotations.Beta
  @Deprecated
  // Initialized from the replacement factory method, as are the constants that follow.
  public static final CharMatcher ASCII = ascii();

  /**
   * Determines whether a character is a digit according to
   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">
   * Unicode</a>. If you only care to match ASCII digits, you can use
   * {@code inRange('0', '9')}.
   *
   * @deprecated Many digits are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #digit()} instead. This
   *     constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher DIGIT = digit();

  /**
   * Determines whether a character is a digit according to
   * {@linkplain Character#isDigit(char) Java's definition}. If you only care to match
   * ASCII digits, you can use {@code inRange('0', '9')}.
   *
   * @deprecated Many digits are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #javaDigit()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_DIGIT = javaDigit();

  /**
   * Determines whether a character is a letter according to
   * {@linkplain Character#isLetter(char) Java's definition}. If you only care to
   * match letters of the Latin alphabet, you can use
   * {@code inRange('a', 'z').or(inRange('A', 'Z'))}.
   *
   * @deprecated Most letters are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #javaLetter()} instead.
   *     This constant is scheduled to be removed in June 2018.
*/
  @com.google.common.annotations.Beta
  @Deprecated
  // Initialized from the replacement factory method, as are the constants that follow.
  public static final CharMatcher JAVA_LETTER = javaLetter();

  /**
   * Determines whether a character is a letter or digit according to
   * {@linkplain Character#isLetterOrDigit(char) Java's definition}.
   *
   * @deprecated Most letters and digits are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #javaLetterOrDigit()}
   *     instead. This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_LETTER_OR_DIGIT = javaLetterOrDigit();

  /**
   * Determines whether a character is upper case according to
   * {@linkplain Character#isUpperCase(char) Java's definition}.
   *
   * @deprecated Some uppercase letters are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #javaUpperCase()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_UPPER_CASE = javaUpperCase();

  /**
   * Determines whether a character is lower case according to
   * {@linkplain Character#isLowerCase(char) Java's definition}.
   *
   * @deprecated Some lowercase letters are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #javaLowerCase()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_LOWER_CASE = javaLowerCase();

  /**
   * Determines whether a character is an ISO control character as specified by
   * {@link Character#isISOControl(char)}.
   *
   * @deprecated Use {@link #javaIsoControl()} instead. This constant is scheduled to
   *     be removed in June 2018.
*/
  @com.google.common.annotations.Beta
  @Deprecated
  // Initialized from the replacement factory method, as are the constants that follow.
  public static final CharMatcher JAVA_ISO_CONTROL = javaIsoControl();

  /**
   * Determines whether a character is invisible; that is, if its Unicode category is
   * any of SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT,
   * SURROGATE, and PRIVATE_USE according to ICU4J.
   *
   * @deprecated Most invisible characters are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #invisible()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher INVISIBLE = invisible();

  /**
   * Determines whether a character is single-width (not double-width). When in doubt,
   * this matcher errs on the side of returning {@code false} (that is, it tends to
   * assume a character is double-width).
   *
   * <p><b>Note:</b> as the reference file evolves, we will modify this constant to
   * keep it up to date.
   *
   * @deprecated Many such characters are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #singleWidth()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher SINGLE_WIDTH = singleWidth();

  /**
   * Matches any character.
   *
   * @deprecated Use {@link #any()} instead. This constant is scheduled to be
   *     removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher ANY = any();

  /**
   * Matches no characters.
   *
   * @deprecated Use {@link #none()} instead. This constant is scheduled to be
   *     removed in June 2018.
468 */ 469 @com.google.common.annotations.Beta 470 @Deprecated 471 public static final CharMatcher NONE = none(); 472 473 // Static factories 474 475 /** Returns a {@code char} matcher that matches only one specified BMP character. */ 476 public static CharMatcher is(final char match) { 477 return new Is(match); 478 } 479 480 /** 481 * Returns a {@code char} matcher that matches any character except the BMP character specified. 482 * 483 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 484 */ 485 public static CharMatcher isNot(final char match) { 486 return new IsNot(match); 487 } 488 489 /** 490 * Returns a {@code char} matcher that matches any BMP character present in the given character 491 * sequence. Returns a bogus matcher if the sequence contains supplementary characters. 492 */ 493 public static CharMatcher anyOf(final CharSequence sequence) { 494 switch (sequence.length()) { 495 case 0: 496 return none(); 497 case 1: 498 return is(sequence.charAt(0)); 499 case 2: 500 return isEither(sequence.charAt(0), sequence.charAt(1)); 501 default: 502 // TODO(lowasser): is it potentially worth just going ahead and building a precomputed 503 // matcher? 504 return new AnyOf(sequence); 505 } 506 } 507 508 /** 509 * Returns a {@code char} matcher that matches any BMP character not present in the given 510 * character sequence. Returns a bogus matcher if the sequence contains supplementary characters. 511 */ 512 public static CharMatcher noneOf(CharSequence sequence) { 513 return anyOf(sequence).negate(); 514 } 515 516 /** 517 * Returns a {@code char} matcher that matches any character in a given BMP range (both endpoints 518 * are inclusive). For example, to match any lowercase letter of the English alphabet, use {@code 519 * CharMatcher.inRange('a', 'z')}. 
520 * 521 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 522 */ 523 public static CharMatcher inRange(final char startInclusive, final char endInclusive) { 524 return new InRange(startInclusive, endInclusive); 525 } 526 527 /** 528 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but 529 * which operates on primitive {@code char} instances instead. 530 */ 531 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) { 532 return predicate instanceof CharMatcher ? (CharMatcher) predicate : new ForPredicate(predicate); 533 } 534 535 // Constructors 536 537 /** 538 * Constructor for use by subclasses. When subclassing, you may want to override {@code 539 * toString()} to provide a useful description. 540 */ 541 protected CharMatcher() {} 542 543 // Abstract methods 544 545 /** Determines a true or false value for the given character. */ 546 public abstract boolean matches(char c); 547 548 // Non-static factories 549 550 /** Returns a matcher that matches any character not matched by this matcher. */ 551 // @Override under Java 8 but not under Java 7 552 //@Override 553 public CharMatcher negate() { 554 return new Negated(this); 555 } 556 557 /** 558 * Returns a matcher that matches any character matched by both this matcher and {@code other}. 559 */ 560 public CharMatcher and(CharMatcher other) { 561 return new And(this, other); 562 } 563 564 /** 565 * Returns a matcher that matches any character matched by either this matcher or {@code other}. 566 */ 567 public CharMatcher or(CharMatcher other) { 568 return new Or(this, other); 569 } 570 571 /** 572 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to 573 * query than the original; your mileage may vary. Precomputation takes time and is likely to be 574 * worthwhile only if the precomputed matcher is queried many thousands of times. 
575 * 576 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a 577 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a 578 * worthwhile tradeoff in a browser. 579 */ 580 public CharMatcher precomputed() { 581 return Platform.precomputeCharMatcher(this); 582 } 583 584 private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1; 585 586 /** 587 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method 588 * on {@link Platform} so that we can have different behavior in GWT. 589 * 590 * <p>This implementation tries to be smart in a number of ways. It recognizes cases where the 591 * negation is cheaper to precompute than the matcher itself; it tries to build small hash tables 592 * for matchers that only match a few characters, and so on. In the worst-case scenario, it 593 * constructs an eight-kilobyte bit array and queries that. In many situations this produces a 594 * matcher which is faster to query than the original. 595 */ 596 @GwtIncompatible // SmallCharMatcher 597 CharMatcher precomputedInternal() { 598 final BitSet table = new BitSet(); 599 setBits(table); 600 int totalCharacters = table.cardinality(); 601 if (totalCharacters * 2 <= DISTINCT_CHARS) { 602 return precomputedPositive(totalCharacters, table, toString()); 603 } else { 604 // TODO(lowasser): is it worth it to worry about the last character of large matchers? 605 table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 606 int negatedCharacters = DISTINCT_CHARS - totalCharacters; 607 String suffix = ".negate()"; 608 final String description = toString(); 609 String negatedDescription = 610 description.endsWith(suffix) 611 ? 
description.substring(0, description.length() - suffix.length()) 612 : description + suffix; 613 return new NegatedFastMatcher( 614 precomputedPositive(negatedCharacters, table, negatedDescription)) { 615 @Override 616 public String toString() { 617 return description; 618 } 619 }; 620 } 621 } 622 623 /** 624 * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper. 625 */ 626 @GwtIncompatible // SmallCharMatcher 627 private static CharMatcher precomputedPositive( 628 int totalCharacters, BitSet table, String description) { 629 switch (totalCharacters) { 630 case 0: 631 return none(); 632 case 1: 633 return is((char) table.nextSetBit(0)); 634 case 2: 635 char c1 = (char) table.nextSetBit(0); 636 char c2 = (char) table.nextSetBit(c1 + 1); 637 return isEither(c1, c2); 638 default: 639 return isSmall(totalCharacters, table.length()) 640 ? SmallCharMatcher.from(table, description) 641 : new BitSetMatcher(table, description); 642 } 643 } 644 645 @GwtIncompatible // SmallCharMatcher 646 private static boolean isSmall(int totalCharacters, int tableLength) { 647 return totalCharacters <= SmallCharMatcher.MAX_SIZE 648 && tableLength > (totalCharacters * 4 * Character.SIZE); 649 // err on the side of BitSetMatcher 650 } 651 652 /** Sets bits in {@code table} matched by this matcher. */ 653 @GwtIncompatible // used only from other GwtIncompatible code 654 void setBits(BitSet table) { 655 for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) { 656 if (matches((char) c)) { 657 table.set(c); 658 } 659 } 660 } 661 662 // Text processing routines 663 664 /** 665 * Returns {@code true} if a character sequence contains at least one matching BMP character. 666 * Equivalent to {@code !matchesNoneOf(sequence)}. 667 * 668 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 669 * character, until this returns {@code true} or the end is reached. 
670 * 671 * @param sequence the character sequence to examine, possibly empty 672 * @return {@code true} if this matcher matches at least one character in the sequence 673 * @since 8.0 674 */ 675 public boolean matchesAnyOf(CharSequence sequence) { 676 return !matchesNoneOf(sequence); 677 } 678 679 /** 680 * Returns {@code true} if a character sequence contains only matching BMP characters. 681 * 682 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 683 * character, until this returns {@code false} or the end is reached. 684 * 685 * @param sequence the character sequence to examine, possibly empty 686 * @return {@code true} if this matcher matches every character in the sequence, including when 687 * the sequence is empty 688 */ 689 public boolean matchesAllOf(CharSequence sequence) { 690 for (int i = sequence.length() - 1; i >= 0; i--) { 691 if (!matches(sequence.charAt(i))) { 692 return false; 693 } 694 } 695 return true; 696 } 697 698 /** 699 * Returns {@code true} if a character sequence contains no matching BMP characters. Equivalent to 700 * {@code !matchesAnyOf(sequence)}. 701 * 702 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 703 * character, until this returns {@code true} or the end is reached. 704 * 705 * @param sequence the character sequence to examine, possibly empty 706 * @return {@code true} if this matcher matches no characters in the sequence, including when the 707 * sequence is empty 708 */ 709 public boolean matchesNoneOf(CharSequence sequence) { 710 return indexIn(sequence) == -1; 711 } 712 713 /** 714 * Returns the index of the first matching BMP character in a character sequence, or {@code -1} if 715 * no matching character is present. 716 * 717 * <p>The default implementation iterates over the sequence in forward order calling {@link 718 * #matches} for each character. 
719 * 720 * @param sequence the character sequence to examine from the beginning 721 * @return an index, or {@code -1} if no character matches 722 */ 723 public int indexIn(CharSequence sequence) { 724 return indexIn(sequence, 0); 725 } 726 727 /** 728 * Returns the index of the first matching BMP character in a character sequence, starting from a 729 * given position, or {@code -1} if no character matches after that position. 730 * 731 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code 732 * start}, calling {@link #matches} for each character. 733 * 734 * @param sequence the character sequence to examine 735 * @param start the first index to examine; must be nonnegative and no greater than {@code 736 * sequence.length()} 737 * @return the index of the first matching character, guaranteed to be no less than {@code start}, 738 * or {@code -1} if no character matches 739 * @throws IndexOutOfBoundsException if start is negative or greater than {@code 740 * sequence.length()} 741 */ 742 public int indexIn(CharSequence sequence, int start) { 743 int length = sequence.length(); 744 checkPositionIndex(start, length); 745 for (int i = start; i < length; i++) { 746 if (matches(sequence.charAt(i))) { 747 return i; 748 } 749 } 750 return -1; 751 } 752 753 /** 754 * Returns the index of the last matching BMP character in a character sequence, or {@code -1} if 755 * no matching character is present. 756 * 757 * <p>The default implementation iterates over the sequence in reverse order calling {@link 758 * #matches} for each character. 
*
   * @param sequence the character sequence to examine from the end
   * @return an index, or {@code -1} if no character matches
   */
  public int lastIndexIn(CharSequence sequence) {
    for (int i = sequence.length() - 1; i >= 0; i--) {
      if (matches(sequence.charAt(i))) {
        return i;
      }
    }
    return -1;
  }

  /**
   * Returns the number of matching {@code char}s found in a character sequence.
   *
   * <p>Counts 2 per supplementary character, such as for {@link #whitespace}().{@link #negate}().
   *
   * @param sequence the character sequence to scan, possibly empty
   * @return how many positions of {@code sequence} hold a matching {@code char}
   */
  public int countIn(CharSequence sequence) {
    int count = 0;
    for (int i = 0; i < sequence.length(); i++) {
      if (matches(sequence.charAt(i))) {
        count++;
      }
    }
    return count;
  }

  /**
   * Returns a string containing all non-matching characters of a character sequence, in order. For
   * example:
   *
   * <pre>{@code
   * CharMatcher.is('a').removeFrom("bazaar")
   * }</pre>
   *
   * ... returns {@code "bzr"}.
   *
   * @param sequence the character sequence to remove matching characters from
   * @return the string form of {@code sequence} with every matching character left out
   */
  public String removeFrom(CharSequence sequence) {
    String string = sequence.toString();
    int pos = indexIn(string);
    if (pos == -1) {
      // Nothing matches, so the string form of the input is already the answer.
      return string;
    }

    // Compact the retained characters in place, starting at the first match.
    char[] chars = string.toCharArray();
    // Number of matching chars skipped so far; each retained char moves left by this many slots.
    int spread = 1;

    // This unusual loop comes from extensive benchmarking
    OUT:
    while (true) {
      // pos sits on a matching char here: step past it.
      pos++;
      while (true) {
        if (pos == chars.length) {
          break OUT;
        }
        if (matches(chars[pos])) {
          break;
        }
        // Non-matching char: shift it left over the gap left by the removed chars.
        chars[pos - spread] = chars[pos];
        pos++;
      }
      // Another match was found, so the gap grows by one.
      spread++;
    }
    // The loop exits only with pos == chars.length, so pos - spread is the retained-char count.
    return new String(chars, 0, pos - spread);
  }

  /**
   * Returns a string containing all matching BMP characters of a character sequence, in order. For
   * example:
   *
   * <pre>{@code
   * CharMatcher.is('a').retainFrom("bazaar")
   * }</pre>
   *
   * ... returns {@code "aaa"}.
835 */ 836 public String retainFrom(CharSequence sequence) { 837 return negate().removeFrom(sequence); 838 } 839 840 /** 841 * Returns a string copy of the input character sequence, with each matching BMP character 842 * replaced by a given replacement character. For example: 843 * 844 * <pre>{@code 845 * CharMatcher.is('a').replaceFrom("radar", 'o') 846 * }</pre> 847 * 848 * ... returns {@code "rodor"}. 849 * 850 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 851 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 852 * character. 853 * 854 * @param sequence the character sequence to replace matching characters in 855 * @param replacement the character to append to the result string in place of each matching 856 * character in {@code sequence} 857 * @return the new string 858 */ 859 public String replaceFrom(CharSequence sequence, char replacement) { 860 String string = sequence.toString(); 861 int pos = indexIn(string); 862 if (pos == -1) { 863 return string; 864 } 865 char[] chars = string.toCharArray(); 866 chars[pos] = replacement; 867 for (int i = pos + 1; i < chars.length; i++) { 868 if (matches(chars[i])) { 869 chars[i] = replacement; 870 } 871 } 872 return new String(chars); 873 } 874 875 /** 876 * Returns a string copy of the input character sequence, with each matching BMP character 877 * replaced by a given replacement sequence. For example: 878 * 879 * <pre>{@code 880 * CharMatcher.is('a').replaceFrom("yaha", "oo") 881 * }</pre> 882 * 883 * ... returns {@code "yoohoo"}. 884 * 885 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better 886 * off calling {@link #replaceFrom(CharSequence, char)} directly. 
887 * 888 * @param sequence the character sequence to replace matching characters in 889 * @param replacement the characters to append to the result string in place of each matching 890 * character in {@code sequence} 891 * @return the new string 892 */ 893 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 894 int replacementLen = replacement.length(); 895 if (replacementLen == 0) { 896 return removeFrom(sequence); 897 } 898 if (replacementLen == 1) { 899 return replaceFrom(sequence, replacement.charAt(0)); 900 } 901 902 String string = sequence.toString(); 903 int pos = indexIn(string); 904 if (pos == -1) { 905 return string; 906 } 907 908 int len = string.length(); 909 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); 910 911 int oldpos = 0; 912 do { 913 buf.append(string, oldpos, pos); 914 buf.append(replacement); 915 oldpos = pos + 1; 916 pos = indexIn(string, oldpos); 917 } while (pos != -1); 918 919 buf.append(string, oldpos, len); 920 return buf.toString(); 921 } 922 923 /** 924 * Returns a substring of the input character sequence that omits all matching BMP characters from 925 * the beginning and from the end of the string. For example: 926 * 927 * <pre>{@code 928 * CharMatcher.anyOf("ab").trimFrom("abacatbab") 929 * }</pre> 930 * 931 * ... returns {@code "cat"}. 932 * 933 * <p>Note that: 934 * 935 * <pre>{@code 936 * CharMatcher.inRange('\0', ' ').trimFrom(str) 937 * }</pre> 938 * 939 * ... is equivalent to {@link String#trim()}. 
940 */ 941 public String trimFrom(CharSequence sequence) { 942 int len = sequence.length(); 943 int first; 944 int last; 945 946 for (first = 0; first < len; first++) { 947 if (!matches(sequence.charAt(first))) { 948 break; 949 } 950 } 951 for (last = len - 1; last > first; last--) { 952 if (!matches(sequence.charAt(last))) { 953 break; 954 } 955 } 956 957 return sequence.subSequence(first, last + 1).toString(); 958 } 959 960 /** 961 * Returns a substring of the input character sequence that omits all matching BMP characters from 962 * the beginning of the string. For example: 963 * 964 * <pre>{@code 965 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab") 966 * }</pre> 967 * 968 * ... returns {@code "catbab"}. 969 */ 970 public String trimLeadingFrom(CharSequence sequence) { 971 int len = sequence.length(); 972 for (int first = 0; first < len; first++) { 973 if (!matches(sequence.charAt(first))) { 974 return sequence.subSequence(first, len).toString(); 975 } 976 } 977 return ""; 978 } 979 980 /** 981 * Returns a substring of the input character sequence that omits all matching BMP characters from 982 * the end of the string. For example: 983 * 984 * <pre>{@code 985 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab") 986 * }</pre> 987 * 988 * ... returns {@code "abacat"}. 989 */ 990 public String trimTrailingFrom(CharSequence sequence) { 991 int len = sequence.length(); 992 for (int last = len - 1; last >= 0; last--) { 993 if (!matches(sequence.charAt(last))) { 994 return sequence.subSequence(0, last + 1).toString(); 995 } 996 } 997 return ""; 998 } 999 1000 /** 1001 * Returns a string copy of the input character sequence, with each group of consecutive matching 1002 * BMP characters replaced by a single replacement character. For example: 1003 * 1004 * <pre>{@code 1005 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-') 1006 * }</pre> 1007 * 1008 * ... returns {@code "b-p-r"}. 
1009 * 1010 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 1011 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 1012 * character. 1013 * 1014 * @param sequence the character sequence to replace matching groups of characters in 1015 * @param replacement the character to append to the result string in place of each group of 1016 * matching characters in {@code sequence} 1017 * @return the new string 1018 */ 1019 public String collapseFrom(CharSequence sequence, char replacement) { 1020 // This implementation avoids unnecessary allocation. 1021 int len = sequence.length(); 1022 for (int i = 0; i < len; i++) { 1023 char c = sequence.charAt(i); 1024 if (matches(c)) { 1025 if (c == replacement && (i == len - 1 || !matches(sequence.charAt(i + 1)))) { 1026 // a no-op replacement 1027 i++; 1028 } else { 1029 StringBuilder builder = new StringBuilder(len).append(sequence, 0, i).append(replacement); 1030 return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true); 1031 } 1032 } 1033 } 1034 // no replacement needed 1035 return sequence.toString(); 1036 } 1037 1038 /** 1039 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that 1040 * groups of matching BMP characters at the start or end of the sequence are removed without 1041 * replacement. 1042 */ 1043 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 1044 // This implementation avoids unnecessary allocation. 1045 int len = sequence.length(); 1046 int first = 0; 1047 int last = len - 1; 1048 1049 while (first < len && matches(sequence.charAt(first))) { 1050 first++; 1051 } 1052 1053 while (last > first && matches(sequence.charAt(last))) { 1054 last--; 1055 } 1056 1057 return (first == 0 && last == len - 1) 1058 ? 
collapseFrom(sequence, replacement) 1059 : finishCollapseFrom( 1060 sequence, first, last + 1, replacement, new StringBuilder(last + 1 - first), false); 1061 } 1062 1063 private String finishCollapseFrom( 1064 CharSequence sequence, 1065 int start, 1066 int end, 1067 char replacement, 1068 StringBuilder builder, 1069 boolean inMatchingGroup) { 1070 for (int i = start; i < end; i++) { 1071 char c = sequence.charAt(i); 1072 if (matches(c)) { 1073 if (!inMatchingGroup) { 1074 builder.append(replacement); 1075 inMatchingGroup = true; 1076 } 1077 } else { 1078 builder.append(c); 1079 inMatchingGroup = false; 1080 } 1081 } 1082 return builder.toString(); 1083 } 1084 1085 /** 1086 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches} 1087 * instead. 1088 */ 1089 @Deprecated 1090 @Override 1091 public boolean apply(Character character) { 1092 return matches(character); 1093 } 1094 1095 /** 1096 * Returns a string representation of this {@code CharMatcher}, such as {@code 1097 * CharMatcher.or(WHITESPACE, JAVA_DIGIT)}. 1098 */ 1099 @Override 1100 public String toString() { 1101 return super.toString(); 1102 } 1103 1104 /** 1105 * Returns the Java Unicode escape sequence for the given {@code char}, in the form "\u12AB" where 1106 * "12AB" is the four hexadecimal digits representing the 16-bit code unit. 1107 */ 1108 private static String showCharacter(char c) { 1109 String hex = "0123456789ABCDEF"; 1110 char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'}; 1111 for (int i = 0; i < 4; i++) { 1112 tmp[5 - i] = hex.charAt(c & 0xF); 1113 c = (char) (c >> 4); 1114 } 1115 return String.copyValueOf(tmp); 1116 } 1117 1118 // Fast matchers 1119 1120 /** A matcher for which precomputation will not yield any significant benefit. 
*/ 1121 abstract static class FastMatcher extends CharMatcher { 1122 1123 @Override 1124 public final CharMatcher precomputed() { 1125 return this; 1126 } 1127 1128 @Override 1129 public CharMatcher negate() { 1130 return new NegatedFastMatcher(this); 1131 } 1132 } 1133 1134 /** {@link FastMatcher} which overrides {@code toString()} with a custom name. */ 1135 abstract static class NamedFastMatcher extends FastMatcher { 1136 1137 private final String description; 1138 1139 NamedFastMatcher(String description) { 1140 this.description = checkNotNull(description); 1141 } 1142 1143 @Override 1144 public final String toString() { 1145 return description; 1146 } 1147 } 1148 1149 /** Negation of a {@link FastMatcher}. */ 1150 static class NegatedFastMatcher extends Negated { 1151 1152 NegatedFastMatcher(CharMatcher original) { 1153 super(original); 1154 } 1155 1156 @Override 1157 public final CharMatcher precomputed() { 1158 return this; 1159 } 1160 } 1161 1162 /** Fast matcher using a {@link BitSet} table of matching characters. */ 1163 @GwtIncompatible // used only from other GwtIncompatible code 1164 private static final class BitSetMatcher extends NamedFastMatcher { 1165 1166 private final BitSet table; 1167 1168 private BitSetMatcher(BitSet table, String description) { 1169 super(description); 1170 if (table.length() + Long.SIZE < table.size()) { 1171 table = (BitSet) table.clone(); 1172 // If only we could actually call BitSet.trimToSize() ourselves... 1173 } 1174 this.table = table; 1175 } 1176 1177 @Override 1178 public boolean matches(char c) { 1179 return table.get(c); 1180 } 1181 1182 @Override 1183 void setBits(BitSet bitSet) { 1184 bitSet.or(table); 1185 } 1186 } 1187 1188 // Static constant implementation classes 1189 1190 /** Implementation of {@link #any()}. 
*/ 1191 private static final class Any extends NamedFastMatcher { 1192 1193 static final Any INSTANCE = new Any(); 1194 1195 private Any() { 1196 super("CharMatcher.any()"); 1197 } 1198 1199 @Override 1200 public boolean matches(char c) { 1201 return true; 1202 } 1203 1204 @Override 1205 public int indexIn(CharSequence sequence) { 1206 return (sequence.length() == 0) ? -1 : 0; 1207 } 1208 1209 @Override 1210 public int indexIn(CharSequence sequence, int start) { 1211 int length = sequence.length(); 1212 checkPositionIndex(start, length); 1213 return (start == length) ? -1 : start; 1214 } 1215 1216 @Override 1217 public int lastIndexIn(CharSequence sequence) { 1218 return sequence.length() - 1; 1219 } 1220 1221 @Override 1222 public boolean matchesAllOf(CharSequence sequence) { 1223 checkNotNull(sequence); 1224 return true; 1225 } 1226 1227 @Override 1228 public boolean matchesNoneOf(CharSequence sequence) { 1229 return sequence.length() == 0; 1230 } 1231 1232 @Override 1233 public String removeFrom(CharSequence sequence) { 1234 checkNotNull(sequence); 1235 return ""; 1236 } 1237 1238 @Override 1239 public String replaceFrom(CharSequence sequence, char replacement) { 1240 char[] array = new char[sequence.length()]; 1241 Arrays.fill(array, replacement); 1242 return new String(array); 1243 } 1244 1245 @Override 1246 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1247 StringBuilder result = new StringBuilder(sequence.length() * replacement.length()); 1248 for (int i = 0; i < sequence.length(); i++) { 1249 result.append(replacement); 1250 } 1251 return result.toString(); 1252 } 1253 1254 @Override 1255 public String collapseFrom(CharSequence sequence, char replacement) { 1256 return (sequence.length() == 0) ? 
"" : String.valueOf(replacement); 1257 } 1258 1259 @Override 1260 public String trimFrom(CharSequence sequence) { 1261 checkNotNull(sequence); 1262 return ""; 1263 } 1264 1265 @Override 1266 public int countIn(CharSequence sequence) { 1267 return sequence.length(); 1268 } 1269 1270 @Override 1271 public CharMatcher and(CharMatcher other) { 1272 return checkNotNull(other); 1273 } 1274 1275 @Override 1276 public CharMatcher or(CharMatcher other) { 1277 checkNotNull(other); 1278 return this; 1279 } 1280 1281 @Override 1282 public CharMatcher negate() { 1283 return none(); 1284 } 1285 } 1286 1287 /** Implementation of {@link #none()}. */ 1288 private static final class None extends NamedFastMatcher { 1289 1290 static final None INSTANCE = new None(); 1291 1292 private None() { 1293 super("CharMatcher.none()"); 1294 } 1295 1296 @Override 1297 public boolean matches(char c) { 1298 return false; 1299 } 1300 1301 @Override 1302 public int indexIn(CharSequence sequence) { 1303 checkNotNull(sequence); 1304 return -1; 1305 } 1306 1307 @Override 1308 public int indexIn(CharSequence sequence, int start) { 1309 int length = sequence.length(); 1310 checkPositionIndex(start, length); 1311 return -1; 1312 } 1313 1314 @Override 1315 public int lastIndexIn(CharSequence sequence) { 1316 checkNotNull(sequence); 1317 return -1; 1318 } 1319 1320 @Override 1321 public boolean matchesAllOf(CharSequence sequence) { 1322 return sequence.length() == 0; 1323 } 1324 1325 @Override 1326 public boolean matchesNoneOf(CharSequence sequence) { 1327 checkNotNull(sequence); 1328 return true; 1329 } 1330 1331 @Override 1332 public String removeFrom(CharSequence sequence) { 1333 return sequence.toString(); 1334 } 1335 1336 @Override 1337 public String replaceFrom(CharSequence sequence, char replacement) { 1338 return sequence.toString(); 1339 } 1340 1341 @Override 1342 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1343 checkNotNull(replacement); 1344 return 
sequence.toString(); 1345 } 1346 1347 @Override 1348 public String collapseFrom(CharSequence sequence, char replacement) { 1349 return sequence.toString(); 1350 } 1351 1352 @Override 1353 public String trimFrom(CharSequence sequence) { 1354 return sequence.toString(); 1355 } 1356 1357 @Override 1358 public String trimLeadingFrom(CharSequence sequence) { 1359 return sequence.toString(); 1360 } 1361 1362 @Override 1363 public String trimTrailingFrom(CharSequence sequence) { 1364 return sequence.toString(); 1365 } 1366 1367 @Override 1368 public int countIn(CharSequence sequence) { 1369 checkNotNull(sequence); 1370 return 0; 1371 } 1372 1373 @Override 1374 public CharMatcher and(CharMatcher other) { 1375 checkNotNull(other); 1376 return this; 1377 } 1378 1379 @Override 1380 public CharMatcher or(CharMatcher other) { 1381 return checkNotNull(other); 1382 } 1383 1384 @Override 1385 public CharMatcher negate() { 1386 return any(); 1387 } 1388 } 1389 1390 /** Implementation of {@link #whitespace()}. */ 1391 @VisibleForTesting 1392 static final class Whitespace extends NamedFastMatcher { 1393 1394 // TABLE is a precomputed hashset of whitespace characters. MULTIPLIER serves as a hash function 1395 // whose key property is that it maps 25 characters into the 32-slot table without collision. 1396 // Basically this is an opportunistic fast implementation as opposed to "good code". For most 1397 // other use-cases, the reduction in readability isn't worth it. 
1398 static final String TABLE = 1399 "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000" 1400 + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680" 1401 + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009" 1402 + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000"; 1403 static final int MULTIPLIER = 1682554634; 1404 static final int SHIFT = Integer.numberOfLeadingZeros(TABLE.length() - 1); 1405 1406 static final Whitespace INSTANCE = new Whitespace(); 1407 1408 Whitespace() { 1409 super("CharMatcher.whitespace()"); 1410 } 1411 1412 @Override 1413 public boolean matches(char c) { 1414 return TABLE.charAt((MULTIPLIER * c) >>> SHIFT) == c; 1415 } 1416 1417 @GwtIncompatible // used only from other GwtIncompatible code 1418 @Override 1419 void setBits(BitSet table) { 1420 for (int i = 0; i < TABLE.length(); i++) { 1421 table.set(TABLE.charAt(i)); 1422 } 1423 } 1424 } 1425 1426 /** Implementation of {@link #breakingWhitespace()}. */ 1427 private static final class BreakingWhitespace extends CharMatcher { 1428 1429 static final CharMatcher INSTANCE = new BreakingWhitespace(); 1430 1431 @Override 1432 public boolean matches(char c) { 1433 switch (c) { 1434 case '\t': 1435 case '\n': 1436 case '\013': 1437 case '\f': 1438 case '\r': 1439 case ' ': 1440 case '\u0085': 1441 case '\u1680': 1442 case '\u2028': 1443 case '\u2029': 1444 case '\u205f': 1445 case '\u3000': 1446 return true; 1447 case '\u2007': 1448 return false; 1449 default: 1450 return c >= '\u2000' && c <= '\u200a'; 1451 } 1452 } 1453 1454 @Override 1455 public String toString() { 1456 return "CharMatcher.breakingWhitespace()"; 1457 } 1458 } 1459 1460 /** Implementation of {@link #ascii()}. 
*/ 1461 private static final class Ascii extends NamedFastMatcher { 1462 1463 static final Ascii INSTANCE = new Ascii(); 1464 1465 Ascii() { 1466 super("CharMatcher.ascii()"); 1467 } 1468 1469 @Override 1470 public boolean matches(char c) { 1471 return c <= '\u007f'; 1472 } 1473 } 1474 1475 /** Implementation that matches characters that fall within multiple ranges. */ 1476 private static class RangesMatcher extends CharMatcher { 1477 1478 private final String description; 1479 private final char[] rangeStarts; 1480 private final char[] rangeEnds; 1481 1482 RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) { 1483 this.description = description; 1484 this.rangeStarts = rangeStarts; 1485 this.rangeEnds = rangeEnds; 1486 checkArgument(rangeStarts.length == rangeEnds.length); 1487 for (int i = 0; i < rangeStarts.length; i++) { 1488 checkArgument(rangeStarts[i] <= rangeEnds[i]); 1489 if (i + 1 < rangeStarts.length) { 1490 checkArgument(rangeEnds[i] < rangeStarts[i + 1]); 1491 } 1492 } 1493 } 1494 1495 @Override 1496 public boolean matches(char c) { 1497 int index = Arrays.binarySearch(rangeStarts, c); 1498 if (index >= 0) { 1499 return true; 1500 } else { 1501 index = ~index - 1; 1502 return index >= 0 && c <= rangeEnds[index]; 1503 } 1504 } 1505 1506 @Override 1507 public String toString() { 1508 return description; 1509 } 1510 } 1511 1512 /** Implementation of {@link #digit()}. */ 1513 private static final class Digit extends RangesMatcher { 1514 // Plug the following UnicodeSet pattern into 1515 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1516 // [[:Nd:]&[:nv=0:]&[\u0000-\uFFFF]] 1517 // and get the zeroes from there. 1518 1519 // Must be in ascending order. 
1520 private static final String ZEROES = 1521 "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66\u0ce6\u0d66\u0de6" 1522 + "\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1a80\u1a90\u1b50\u1bb0" 1523 + "\u1c40\u1c50\ua620\ua8d0\ua900\ua9d0\ua9f0\uaa50\uabf0\uff10"; 1524 1525 private static char[] zeroes() { 1526 return ZEROES.toCharArray(); 1527 } 1528 1529 private static char[] nines() { 1530 char[] nines = new char[ZEROES.length()]; 1531 for (int i = 0; i < ZEROES.length(); i++) { 1532 nines[i] = (char) (ZEROES.charAt(i) + 9); 1533 } 1534 return nines; 1535 } 1536 1537 static final Digit INSTANCE = new Digit(); 1538 1539 private Digit() { 1540 super("CharMatcher.digit()", zeroes(), nines()); 1541 } 1542 } 1543 1544 /** Implementation of {@link #javaDigit()}. */ 1545 private static final class JavaDigit extends CharMatcher { 1546 1547 static final JavaDigit INSTANCE = new JavaDigit(); 1548 1549 @Override 1550 public boolean matches(char c) { 1551 return Character.isDigit(c); 1552 } 1553 1554 @Override 1555 public String toString() { 1556 return "CharMatcher.javaDigit()"; 1557 } 1558 } 1559 1560 /** Implementation of {@link #javaLetter()}. */ 1561 private static final class JavaLetter extends CharMatcher { 1562 1563 static final JavaLetter INSTANCE = new JavaLetter(); 1564 1565 @Override 1566 public boolean matches(char c) { 1567 return Character.isLetter(c); 1568 } 1569 1570 @Override 1571 public String toString() { 1572 return "CharMatcher.javaLetter()"; 1573 } 1574 } 1575 1576 /** Implementation of {@link #javaLetterOrDigit()}. 
*/ 1577 private static final class JavaLetterOrDigit extends CharMatcher { 1578 1579 static final JavaLetterOrDigit INSTANCE = new JavaLetterOrDigit(); 1580 1581 @Override 1582 public boolean matches(char c) { 1583 return Character.isLetterOrDigit(c); 1584 } 1585 1586 @Override 1587 public String toString() { 1588 return "CharMatcher.javaLetterOrDigit()"; 1589 } 1590 } 1591 1592 /** Implementation of {@link #javaUpperCase()}. */ 1593 private static final class JavaUpperCase extends CharMatcher { 1594 1595 static final JavaUpperCase INSTANCE = new JavaUpperCase(); 1596 1597 @Override 1598 public boolean matches(char c) { 1599 return Character.isUpperCase(c); 1600 } 1601 1602 @Override 1603 public String toString() { 1604 return "CharMatcher.javaUpperCase()"; 1605 } 1606 } 1607 1608 /** Implementation of {@link #javaLowerCase()}. */ 1609 private static final class JavaLowerCase extends CharMatcher { 1610 1611 static final JavaLowerCase INSTANCE = new JavaLowerCase(); 1612 1613 @Override 1614 public boolean matches(char c) { 1615 return Character.isLowerCase(c); 1616 } 1617 1618 @Override 1619 public String toString() { 1620 return "CharMatcher.javaLowerCase()"; 1621 } 1622 } 1623 1624 /** Implementation of {@link #javaIsoControl()}. */ 1625 private static final class JavaIsoControl extends NamedFastMatcher { 1626 1627 static final JavaIsoControl INSTANCE = new JavaIsoControl(); 1628 1629 private JavaIsoControl() { 1630 super("CharMatcher.javaIsoControl()"); 1631 } 1632 1633 @Override 1634 public boolean matches(char c) { 1635 return c <= '\u001f' || (c >= '\u007f' && c <= '\u009f'); 1636 } 1637 } 1638 1639 /** Implementation of {@link #invisible()}. */ 1640 private static final class Invisible extends RangesMatcher { 1641 // Plug the following UnicodeSet pattern into 1642 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1643 // [[[:Zs:][:Zl:][:Zp:][:Cc:][:Cf:][:Cs:][:Co:]]&[\u0000-\uFFFF]] 1644 // with the "Abbreviate" option, and get the ranges from there. 
1645 private static final String RANGE_STARTS = 1646 "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u08e2\u1680\u180e\u2000\u2028\u205f\u2066" 1647 + "\u3000\ud800\ufeff\ufff9"; 1648 private static final String RANGE_ENDS = // inclusive ends 1649 "\u0020\u00a0\u00ad\u0605\u061c\u06dd\u070f\u08e2\u1680\u180e\u200f\u202f\u2064\u206f" 1650 + "\u3000\uf8ff\ufeff\ufffb"; 1651 1652 static final Invisible INSTANCE = new Invisible(); 1653 1654 private Invisible() { 1655 super("CharMatcher.invisible()", RANGE_STARTS.toCharArray(), RANGE_ENDS.toCharArray()); 1656 } 1657 } 1658 1659 /** Implementation of {@link #singleWidth()}. */ 1660 private static final class SingleWidth extends RangesMatcher { 1661 1662 static final SingleWidth INSTANCE = new SingleWidth(); 1663 1664 private SingleWidth() { 1665 super( 1666 "CharMatcher.singleWidth()", 1667 "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(), 1668 "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray()); 1669 } 1670 } 1671 1672 // Non-static factory implementation classes 1673 1674 /** Implementation of {@link #negate()}. 
*/ 1675 private static class Negated extends CharMatcher { 1676 1677 final CharMatcher original; 1678 1679 Negated(CharMatcher original) { 1680 this.original = checkNotNull(original); 1681 } 1682 1683 @Override 1684 public boolean matches(char c) { 1685 return !original.matches(c); 1686 } 1687 1688 @Override 1689 public boolean matchesAllOf(CharSequence sequence) { 1690 return original.matchesNoneOf(sequence); 1691 } 1692 1693 @Override 1694 public boolean matchesNoneOf(CharSequence sequence) { 1695 return original.matchesAllOf(sequence); 1696 } 1697 1698 @Override 1699 public int countIn(CharSequence sequence) { 1700 return sequence.length() - original.countIn(sequence); 1701 } 1702 1703 @GwtIncompatible // used only from other GwtIncompatible code 1704 @Override 1705 void setBits(BitSet table) { 1706 BitSet tmp = new BitSet(); 1707 original.setBits(tmp); 1708 tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 1709 table.or(tmp); 1710 } 1711 1712 @Override 1713 public CharMatcher negate() { 1714 return original; 1715 } 1716 1717 @Override 1718 public String toString() { 1719 return original + ".negate()"; 1720 } 1721 } 1722 1723 /** Implementation of {@link #and(CharMatcher)}. 
*/ 1724 private static final class And extends CharMatcher { 1725 1726 final CharMatcher first; 1727 final CharMatcher second; 1728 1729 And(CharMatcher a, CharMatcher b) { 1730 first = checkNotNull(a); 1731 second = checkNotNull(b); 1732 } 1733 1734 @Override 1735 public boolean matches(char c) { 1736 return first.matches(c) && second.matches(c); 1737 } 1738 1739 @GwtIncompatible // used only from other GwtIncompatible code 1740 @Override 1741 void setBits(BitSet table) { 1742 BitSet tmp1 = new BitSet(); 1743 first.setBits(tmp1); 1744 BitSet tmp2 = new BitSet(); 1745 second.setBits(tmp2); 1746 tmp1.and(tmp2); 1747 table.or(tmp1); 1748 } 1749 1750 @Override 1751 public String toString() { 1752 return "CharMatcher.and(" + first + ", " + second + ")"; 1753 } 1754 } 1755 1756 /** Implementation of {@link #or(CharMatcher)}. */ 1757 private static final class Or extends CharMatcher { 1758 1759 final CharMatcher first; 1760 final CharMatcher second; 1761 1762 Or(CharMatcher a, CharMatcher b) { 1763 first = checkNotNull(a); 1764 second = checkNotNull(b); 1765 } 1766 1767 @GwtIncompatible // used only from other GwtIncompatible code 1768 @Override 1769 void setBits(BitSet table) { 1770 first.setBits(table); 1771 second.setBits(table); 1772 } 1773 1774 @Override 1775 public boolean matches(char c) { 1776 return first.matches(c) || second.matches(c); 1777 } 1778 1779 @Override 1780 public String toString() { 1781 return "CharMatcher.or(" + first + ", " + second + ")"; 1782 } 1783 } 1784 1785 // Static factory implementations 1786 1787 /** Implementation of {@link #is(char)}. 
*/ 1788 private static final class Is extends FastMatcher { 1789 1790 private final char match; 1791 1792 Is(char match) { 1793 this.match = match; 1794 } 1795 1796 @Override 1797 public boolean matches(char c) { 1798 return c == match; 1799 } 1800 1801 @Override 1802 public String replaceFrom(CharSequence sequence, char replacement) { 1803 return sequence.toString().replace(match, replacement); 1804 } 1805 1806 @Override 1807 public CharMatcher and(CharMatcher other) { 1808 return other.matches(match) ? this : none(); 1809 } 1810 1811 @Override 1812 public CharMatcher or(CharMatcher other) { 1813 return other.matches(match) ? other : super.or(other); 1814 } 1815 1816 @Override 1817 public CharMatcher negate() { 1818 return isNot(match); 1819 } 1820 1821 @GwtIncompatible // used only from other GwtIncompatible code 1822 @Override 1823 void setBits(BitSet table) { 1824 table.set(match); 1825 } 1826 1827 @Override 1828 public String toString() { 1829 return "CharMatcher.is('" + showCharacter(match) + "')"; 1830 } 1831 } 1832 1833 /** Implementation of {@link #isNot(char)}. */ 1834 private static final class IsNot extends FastMatcher { 1835 1836 private final char match; 1837 1838 IsNot(char match) { 1839 this.match = match; 1840 } 1841 1842 @Override 1843 public boolean matches(char c) { 1844 return c != match; 1845 } 1846 1847 @Override 1848 public CharMatcher and(CharMatcher other) { 1849 return other.matches(match) ? super.and(other) : other; 1850 } 1851 1852 @Override 1853 public CharMatcher or(CharMatcher other) { 1854 return other.matches(match) ? 
any() : this; 1855 } 1856 1857 @GwtIncompatible // used only from other GwtIncompatible code 1858 @Override 1859 void setBits(BitSet table) { 1860 table.set(0, match); 1861 table.set(match + 1, Character.MAX_VALUE + 1); 1862 } 1863 1864 @Override 1865 public CharMatcher negate() { 1866 return is(match); 1867 } 1868 1869 @Override 1870 public String toString() { 1871 return "CharMatcher.isNot('" + showCharacter(match) + "')"; 1872 } 1873 } 1874 1875 private static CharMatcher.IsEither isEither(char c1, char c2) { 1876 return new CharMatcher.IsEither(c1, c2); 1877 } 1878 1879 /** Implementation of {@link #anyOf(CharSequence)} for exactly two characters. */ 1880 private static final class IsEither extends FastMatcher { 1881 1882 private final char match1; 1883 private final char match2; 1884 1885 IsEither(char match1, char match2) { 1886 this.match1 = match1; 1887 this.match2 = match2; 1888 } 1889 1890 @Override 1891 public boolean matches(char c) { 1892 return c == match1 || c == match2; 1893 } 1894 1895 @GwtIncompatible // used only from other GwtIncompatible code 1896 @Override 1897 void setBits(BitSet table) { 1898 table.set(match1); 1899 table.set(match2); 1900 } 1901 1902 @Override 1903 public String toString() { 1904 return "CharMatcher.anyOf(\"" + showCharacter(match1) + showCharacter(match2) + "\")"; 1905 } 1906 } 1907 1908 /** Implementation of {@link #anyOf(CharSequence)} for three or more characters. 
*/
  private static final class AnyOf extends CharMatcher {

    private final char[] chars;

    /**
     * @param chars the characters to match; they are copied and sorted so that {@link
     *     #matches(char)} can use binary search
     */
    public AnyOf(CharSequence chars) {
      char[] sorted = chars.toString().toCharArray();
      Arrays.sort(sorted);
      this.chars = sorted;
    }

    @Override
    public boolean matches(char c) {
      int position = Arrays.binarySearch(chars, c);
      return position >= 0;
    }

    @Override
    @GwtIncompatible // used only from other GwtIncompatible code
    void setBits(BitSet table) {
      for (int i = 0; i < chars.length; i++) {
        table.set(chars[i]);
      }
    }

    /** Lists the matched characters, in sorted order, as Unicode escapes. */
    @Override
    public String toString() {
      StringBuilder description = new StringBuilder("CharMatcher.anyOf(\"");
      for (int i = 0; i < chars.length; i++) {
        description.append(showCharacter(chars[i]));
      }
      return description.append("\")").toString();
    }
  }

  /** Implementation of {@link #inRange(char, char)}. */
  private static final class InRange extends FastMatcher {

    private final char startInclusive;
    private final char endInclusive;

    /**
     * @throws IllegalArgumentException if {@code endInclusive} is less than {@code startInclusive}
     */
    InRange(char startInclusive, char endInclusive) {
      checkArgument(endInclusive >= startInclusive);
      this.startInclusive = startInclusive;
      this.endInclusive = endInclusive;
    }

    @Override
    public boolean matches(char c) {
      return c >= startInclusive && c <= endInclusive;
    }

    @GwtIncompatible // used only from other GwtIncompatible code
    @Override
    void setBits(BitSet table) {
      // BitSet.set(from, to) excludes "to", hence the +1 to include the last character.
      table.set(startInclusive, endInclusive + 1);
    }

    @Override
    public String toString() {
      String from = showCharacter(startInclusive);
      String to = showCharacter(endInclusive);
      return "CharMatcher.inRange('" + from + "', '" + to + "')";
    }
  }

  /** Implementation of {@link #forPredicate(Predicate)}. */
  private static final class ForPredicate extends CharMatcher {

    private final Predicate<? super Character> predicate;

    /**
     * @param predicate the predicate consulted for every character; must not be null
     */
    ForPredicate(Predicate<? super Character> predicate) {
      this.predicate = checkNotNull(predicate);
    }

    /** Boxes {@code c} and asks the wrapped predicate. */
    @Override
    public boolean matches(char c) {
      Character boxed = c;
      return predicate.apply(boxed);
    }

    /** Passes the boxed character straight to the wrapped predicate after a null check. */
    @SuppressWarnings("deprecation") // intentional; deprecation is for callers primarily
    @Override
    public boolean apply(Character character) {
      Character checked = checkNotNull(character);
      return predicate.apply(checked);
    }

    @Override
    public String toString() {
      String description = "CharMatcher.forPredicate(" + predicate + ")";
      return description;
    }
  }
}