001/* 002 * ============================================================================ 003 * Copyright © 2002-2023 by Thomas Thrien. 004 * All Rights Reserved. 005 * ============================================================================ 006 * Licensed to the public under the agreements of the GNU Lesser General Public 007 * License, version 3.0 (the "License"). You may obtain a copy of the License at 008 * 009 * http://www.gnu.org/licenses/lgpl.html 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 013 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 014 * License for the specific language governing permissions and limitations 015 * under the License. 016 */ 017 018package org.tquadrat.foundation.util.internal; 019 020import static java.lang.String.format; 021import static java.util.Arrays.stream; 022import static java.util.stream.Collectors.joining; 023import static java.util.stream.Stream.builder; 024import static org.apiguardian.api.API.Status.INTERNAL; 025import static org.tquadrat.foundation.lang.CommonConstants.UTF8; 026import static org.tquadrat.foundation.lang.Objects.isNull; 027import static org.tquadrat.foundation.lang.Objects.nonNull; 028import static org.tquadrat.foundation.lang.Objects.requireNonNullArgument; 029import static org.tquadrat.foundation.lang.Objects.requireNotEmptyArgument; 030import static org.tquadrat.foundation.util.StringUtils.isEmptyOrBlank; 031import static org.tquadrat.foundation.util.StringUtils.isNotEmptyOrBlank; 032 033import java.io.BufferedReader; 034import java.io.IOException; 035import java.io.InputStreamReader; 036import java.net.URL; 037import java.util.Map; 038import java.util.Optional; 039import java.util.TreeMap; 040import java.util.function.Supplier; 041import java.util.stream.Stream; 042import java.util.stream.Stream.Builder; 043 044import org.apiguardian.api.API; 045import org.tquadrat.foundation.annotation.ClassVersion; 046import org.tquadrat.foundation.exception.UnexpectedExceptionError; 047import org.tquadrat.foundation.lang.Lazy; 048import org.tquadrat.foundation.util.StringUtils; 049 050/** 051 * Provides HTML and XML entity utilities. 052 * 053 * @extauthor Thomas Thrien - thomas.thrien@tquadrat.org 054 * @thanks Alexander Day Chaffee - alex@purpletech.com 055 * @thanks Gary Gregory - ggregory@seagullsw.com 056 * @inspired Some code I found somewhere long time ago, originally written by 057 * Alexander Day Chaffee and Gary Gregory 058 * @version $Id: Entities.java 1060 2023-09-24 19:21:40Z tquadrat $ 059 * @since 0.0.5 060 * 061 * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a> 062 * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a> 063 * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a> 064 * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a> 065 * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a> 066 * @see <a href="https://www.quackit.com/character_sets/html5_entities/html5_entities_all.cfm">HTML5 Entities in Alphabetical Order - Complete List</a> 067 * 068 * @UMLGraph.link 069 */ 070@ClassVersion( sourceVersion = "$Id: Entities.java 1060 2023-09-24 19:21:40Z tquadrat $" ) 071@API( status = INTERNAL, since = "0.0.5" ) 072public final class Entities 073{ 074 /*---------------*\ 075 ====** Inner Classes **==================================================== 076 \*---------------*/ 077 /** 078 * Local interface for the data structure that is used to store the 079 * entity mappings. 080 * 081 * @extauthor Alexander Day Chaffee - alex@purpletech.com 082 * @extauthor Gary Gregory - ggregory@seagullsw.com 083 * @extauthor Thomas Thrien - thomas.thrien@tquadrat.org 084 * @version $Id: Entities.java 1060 2023-09-24 19:21:40Z tquadrat $ 085 * @since 0.0.5 086 * 087 * @UMLGraph.link 088 */ 089 @ClassVersion( sourceVersion = "$Id: Entities.java 1060 2023-09-24 19:21:40Z tquadrat $" ) 090 private static interface EntityMap 091 { 092 /*---------*\ 093 ====** Methods **====================================================== 094 \*---------*/ 095 /** 096 * Adds an entry to this entity map.<br> 097 * <br>If the value is negative, only the name to value relation will 098 * be stored. 099 * 100 * @param name The entity name. 101 * @param value The entity value (the Unicode code). 102 */ 103 public void add( String name, Integer value ); 104 105 /** 106 * Returns the entities. 107 * 108 * @return The entities. 109 */ 110 public Stream<String> list(); 111 112 /** 113 * Returns the name of the entity identified by the specified value. 114 * 115 * @param value The value to locate. 116 * @return An instance of 117 * {@link Optional} 118 * that holds the entity name that is associated with the 119 * specified value. 120 */ 121 public Optional<String> name( int value ); 122 123 /** 124 * Returns the value of the entity identified by the specified name. 125 * 126 * @param name The name of the entity to locate 127 * @return An instance of 128 * {@link Optional} 129 * that holds the entity value associated with the specified name. 130 */ 131 public Optional<Integer> value( String name ); 132 } 133 // interface EntityMap 134 135 /** 136 * A simple implementation for the interface 137 * {@link EntityMap}. 138 * 139 * @extauthor Alexander Day Chaffee - alex@purpletech.com 140 * @extauthor Gary Gregory - ggregory@seagullsw.com 141 * @extauthor Thomas Thrien - thomas.thrien@tquadrat.org 142 * @version $Id: Entities.java 1060 2023-09-24 19:21:40Z tquadrat $ 143 * @since 0.0.5 144 * 145 * @UMLGraph.link 146 */ 147 @ClassVersion( sourceVersion = "$Id: Entities.java 1060 2023-09-24 19:21:40Z tquadrat $" ) 148 private static class PrimitiveEntityMap implements EntityMap 149 { 150 /*------------*\ 151 ====** Attributes **=================================================== 152 \*------------*/ 153 /** 154 * The map that holds the references from the name to the value. 155 */ 156 private final Map<String,Integer> m_NameToValue = new TreeMap<>(); 157 158 /** 159 * The map that holds the references from the value to the name. 160 */ 161 private final Map<Integer,String> m_ValueToName = new TreeMap<>(); 162 163 /*--------------*\ 164 ====** Constructors **================================================= 165 \*--------------*/ 166 /** 167 * Creates a new primitive entity map. 168 */ 169 public PrimitiveEntityMap() { /* Does nothing but exist */ } 170 171 /*---------*\ 172 ====** Methods **====================================================== 173 \*---------*/ 174 /** 175 * {@inheritDoc} 176 */ 177 @Override 178 public final void add( final String name, final Integer value ) 179 { 180 assert isNotEmptyOrBlank( name ) : "name is empty or null"; 181 assert nonNull( value ) : "value is null"; 182 assert value.intValue() != 0 : "value is 0"; 183 184 if( value.intValue() > 0 ) 185 { 186 m_NameToValue.put( name, value ); 187 final var previousName = m_ValueToName.put( value, name ); 188 assert isNull( previousName ) : "Duplicate: %s, %s, %d".formatted( name, previousName, value ); 189 } 190 else 191 { 192 m_NameToValue.put( name, Integer.valueOf( -value.intValue() ) ); 193 } 194 } // add() 195 196 /** 197 * {@inheritDoc} 198 */ 199 @Override 200 public final Stream<String> list() { return m_NameToValue.keySet().stream(); } 201 202 /** 203 * {@inheritDoc} 204 */ 205 @Override 206 public final Optional<String> name( final int value ) { return Optional.ofNullable( m_ValueToName.get( value ) ); } 207 208 /** 209 * {@inheritDoc} 210 */ 211 @Override 212 public final Optional<Integer> value( final String name ) 213 { 214 final var retValue = Optional.ofNullable( m_NameToValue.get( requireNotEmptyArgument( name, "name" ) ) ); 215 216 //---* Done *------------------------------------------------------ 217 return retValue; 218 } // value() 219 } 220 // class PrimitiveEntityMap 221 222 /*-----------*\ 223 ====** Constants **======================================================== 224 \*-----------*/ 225 /** 226 * The name for the resource file that holds the additional XML entities: 227 * {@value}. These entities are not defined for HTML before HTML 5. 228 */ 229 public static final String ADDITIONAL_XML_ENTITIES = "apos_entities.data"; 230 231 /** 232 * The name for the resource file that holds the basic entities that are 233 * common for both XML and HTML: {@value}. 234 */ 235 public static final String BASIC_ENTITIES = "basic_entities.data"; 236 237 /** 238 * The name for the resource final that holds the entities that were 239 * introduced for HTML 3.2: {@value}. 240 */ 241 public static final String HTML32_ENTITIES = "ISO8859_1_entities.data"; 242 243 /** 244 * The name for the resource final that holds the entities that were 245 * introduced for HTML 4.0: {@value}. 246 */ 247 public static final String HTML40_ENTITIES = "html40_entities.data"; 248 249 /** 250 * The name for the resource final that holds the entities that were 251 * introduced for HTML 5.0: {@value}. 252 */ 253 public static final String HTML50_ENTITIES = "html50_entities.data"; 254 255 /*------------*\ 256 ====** Attributes **======================================================= 257 \*------------*/ 258 /** 259 * The entity mapping. 260 */ 261 private final Lazy<EntityMap> m_EntityMap; 262 263 /*------------------------*\ 264 ====** Static Initialisations **=========================================== 265 \*------------------------*/ 266 /** 267 * The set of basic entities. 268 */ 269 public static final Entities BASIC; 270 271 /** 272 * The set of entities supported by HTML 3.2. 273 */ 274 public static final Entities HTML32; 275 276 /** 277 * The set of entities supported by HTML 4.0. 278 */ 279 public static final Entities HTML40; 280 281 /** 282 * The set of entities supported by HTML 5.0. 283 */ 284 public static final Entities HTML50; 285 286 /** 287 * The set of entities supported by standard XML. 288 */ 289 public static final Entities XML; 290 291 static 292 { 293 //---* Set the BASIC entities *---------------------------------------- 294 BASIC = new Entities( BASIC_ENTITIES ); 295 296 //---* Set the HTML 3.2 entities *------------------------------------- 297 HTML32 = new Entities( BASIC_ENTITIES, HTML32_ENTITIES ); 298 299 //---* Set the HTML 4.0 entities *------------------------------------- 300 HTML40 = new Entities( BASIC_ENTITIES, HTML32_ENTITIES, HTML40_ENTITIES ); 301 302 //---* Set the HTML 5.0 entities *------------------------------------- 303 HTML50 = new Entities( BASIC_ENTITIES, HTML32_ENTITIES, HTML40_ENTITIES, HTML50_ENTITIES ); 304 305 //---* Set the HTML XML entities *------------------------------------- 306 XML = new Entities( BASIC_ENTITIES, ADDITIONAL_XML_ENTITIES ); 307 } 308 309 /*--------------*\ 310 ====** Constructors **===================================================== 311 \*--------------*/ 312 /** 313 * Creates a new {@code Entities} instance. 314 * 315 * @param resourceNames The names of the resource files with the entity 316 * definitions. 317 */ 318 private Entities( final String... resourceNames ) 319 { 320 final var thisClass = getClass(); 321 final var packageName = thisClass.getPackageName().replace( '.', '/' ); 322 final var resourceURLs = stream( resourceNames ) 323 .map( name -> 324 { 325 final var resourceName = format( "/%s/%s", packageName, name ); 326 final var resourceURL = thisClass.getResource( resourceName ); 327 assert nonNull( resourceURL ) : "URL is null for %s".formatted( resourceName ); 328 return resourceURL; 329 } ) 330 .toArray( URL []::new ); 331 332 final var supplier = (Supplier<EntityMap>) () -> 333 { 334 final EntityMap map = new PrimitiveEntityMap(); 335 for( final var resourceURL : resourceURLs ) 336 { 337 loadEntities( map, resourceURL ); 338 } 339 return map; 340 }; 341 342 m_EntityMap = Lazy.use( supplier ); 343 } // Entities() 344 345 /*---------*\ 346 ====** Methods **========================================================== 347 \*---------*/ 348 /** 349 * Underlying unescape method that allows the optimisation of not starting 350 * from the 0 index again. 351 * 352 * @param buffer The buffer to write the results to. 353 * @param source The source {@code String} to unescape. 354 * @param firstAmp The index of the first ampersand in the source. 355 * @throws IOException Problems on writing to the {@code buffer}. 356 */ 357 @SuppressWarnings( {"MagicNumber", "OverlyNestedMethod", "OverlyComplexMethod"} ) 358 private void doUnescape( final Appendable buffer, final CharSequence source, final int firstAmp ) throws IOException 359 { 360 assert nonNull( buffer ) : "buffer is null"; 361 assert nonNull( source ) : "source is null"; 362 assert firstAmp >= 0 : "firstAmp is less than 0"; 363 364 buffer.append( source, 0, firstAmp ); 365 final var str = source.toString(); 366 final var len = str.length(); 367 char isHexChar; 368 int nextIndex; 369 int semiColonIndex; 370 int ampersandIndex; 371 int entityContentLen; 372 Optional<Integer> entityValue; 373 ScanLoop: for( var i = firstAmp; i < len; ++i ) 374 { 375 final var currentCharacter = str.charAt( i ); 376 if( currentCharacter == '&' ) 377 { 378 nextIndex = i + 1; 379 semiColonIndex = str.indexOf( ';', nextIndex ); 380 if( semiColonIndex == -1 ) 381 { 382 buffer.append( '&' ); 383 continue ScanLoop; 384 } 385 ampersandIndex = str.indexOf( '&', i + 1 ); 386 if( (ampersandIndex != -1) && (ampersandIndex < semiColonIndex) ) 387 { 388 //---* The text looks like "&...&...;" *------------------- 389 buffer.append( '&' ); 390 continue ScanLoop; 391 } 392 final var entityContent = str.substring( nextIndex, semiColonIndex ); 393 entityValue = Optional.empty(); 394 entityContentLen = entityContent.length(); 395 if( entityContentLen > 0 ) 396 { 397 if( entityContent.charAt( 0 ) == '#' ) 398 { 399 /* 400 * Escaped value content is an integer (decimal or 401 * hexadecimal) 402 */ 403 if( entityContentLen > 1 ) 404 { 405 isHexChar = entityContent.charAt( 1 ); 406 try 407 { 408 final var value = switch( isHexChar ) 409 { 410 case 'X', 'x' -> Integer.parseInt( entityContent.substring( 2 ), 0x10 ); 411 default -> Integer.parseInt( entityContent.substring( 1 ), 10 ); 412 }; 413 entityValue = value > 0xFFFFFF ? Optional.empty() : Optional.of( Integer.valueOf( value ) ); 414 } 415 catch( final NumberFormatException ignored ) 416 { 417 entityValue = Optional.empty(); 418 } 419 } 420 } 421 else 422 { 423 //---* Escaped value content is an entity name *------- 424 entityValue = entityValue( entityContent ); 425 } 426 } 427 buffer.append( entityValue.map( v -> Character.toString( v.intValue() ) ).orElseGet( () -> format( "&%s;", entityContent ) ) ); 428 429 //---* Move the index up to the semi-colon *------------------ 430 //noinspection AssignmentToForLoopParameter 431 i = semiColonIndex; 432 } 433 else 434 { 435 buffer.append( currentCharacter ); 436 } 437 } // ScanLoop: 438 } // doUnescape() 439 440 /** 441 * Returns the name of the entity identified by the specified value. 442 * 443 * @param value The value to locate. 444 * @return An instance of 445 * {@link Optional} 446 * that holds the entity name that is associated with the specified 447 * value. 448 */ 449 public final Optional<String> entityName( final int value ) { return m_EntityMap.get().name( value ); } 450 451 /** 452 * Returns the value of the entity identified by the specified name. 453 * 454 * @param name The name to locate. 455 * @return An instance of 456 * {@link Optional} 457 * that holds the entity value associated with the specified name. 458 */ 459 public final Optional<Integer> entityValue( final String name ) { return m_EntityMap.get().value( name ); } 460 461 /** 462 * Escapes the characters in a {@code String}.<br> 463 * <br>For example, if you have called 464 * {@code addEntity( "foo", "0xA1" )}, a call to 465 * {@code escape( "\u00A1" )} will return {@code "&foo;"}. 466 * 467 * @param source The {@code String} to escape. 468 * @return A new escaped {@code String}. 469 */ 470 @SuppressWarnings( "UnnecessaryUnicodeEscape" ) 471 public final String escape( final CharSequence source ) 472 { 473 @SuppressWarnings( "NumericCastThatLosesPrecision" ) 474 final var retValue = requireNonNullArgument( source, "source" ) 475 .codePoints() 476 .mapToObj( codePoint -> entityName( codePoint ).map( name -> format( "&%s;", name ) ) 477 .orElseGet( () -> codePoint > 0x7F ? formatCodePoint( codePoint ) : Character.toString( (char) codePoint ) ) ) 478 .collect( joining() ); 479 480 //---* Done *---------------------------------------------------------- 481 return retValue; 482 } // escape() 483 484 /** 485 * Escapes the characters in the {@code String} passed and writes the 486 * result to the 487 * {@link Appendable} 488 * passed. 489 * 490 * @param appendable The {@code Appendable} to write the results of the 491 * escaping to. 492 * @param input The {@code String} to escape. 493 * @throws IOException when {@code Appendable} passed throws the exception 494 * from calls to the 495 * {@link Appendable#append(char)} 496 * method. 497 * @see #escape(CharSequence) 498 */ 499 public final void escape( final Appendable appendable, final CharSequence input ) throws IOException 500 { 501 requireNonNullArgument( appendable, "appendable" ).append( escape( requireNonNullArgument( input, "input" ) ) ); 502 } // escape() 503 504 /** 505 * Converts a code point into the numerical HTML escape format. 506 * 507 * @param codePoint The code point. 508 * @return The HTML escaped code point. 509 */ 510 private static final String formatCodePoint( final int codePoint ) 511 { 512 final Builder<String> builder = builder(); 513 for( final var c : Character.toChars( codePoint ) ) 514 { 515 builder.add( format( "&#x%X;", (int) c ) ); 516 } 517 final var retValue = builder.build().collect( joining() ); 518 519 //---* Done *---------------------------------------------------------- 520 return retValue; 521 } // formatCodePoint() 522 523 /** 524 * Returns a list of all known entities. 525 * 526 * @return An array of String with the entities, there numerical values 527 * and the Unicode name of the entity. 528 */ 529 public final String [] listEntities() 530 { 531 final var retValue = m_EntityMap.get().list() 532 .sorted() 533 .map( e -> 534 { 535 /* 536 * For all existing entities, entityValue() will return a 537 * value, so that the check on presence is obsolete. 538 */ 539 @SuppressWarnings( "OptionalGetWithoutIsPresent" ) 540 final var value = entityValue( e ).get().intValue(); 541 final var unicode = Character.getName( value ); 542 return "&%1$s; = &#%2$d = &#%2$X%3$s".formatted( e, value, isEmptyOrBlank( unicode ) ? "" : format( " (%s)", unicode ) ); 543 } ) 544 .toArray( String []::new ); 545 546 //---* Done *---------------------------------------------------------- 547 return retValue; 548 } // listEntities() 549 550 /** 551 * Load the entities from the resource identified by the given URL to the 552 * given target entity map. 553 * 554 * @param entityMap The map that is the target for the entities. 555 * @param resourceURL The URL for the resource. 556 */ 557 @SuppressWarnings( "ProhibitedExceptionThrown" ) 558 private static void loadEntities( final EntityMap entityMap, final URL resourceURL ) 559 { 560 assert nonNull( entityMap ) : "entityMap is null"; 561 assert nonNull( resourceURL ) : "resourceURL is null"; 562 563 try( final var reader = new BufferedReader( new InputStreamReader( resourceURL.openStream(), UTF8 ) ) ) 564 { 565 reader.lines() 566 .filter( StringUtils::isNotEmptyOrBlank ) 567 .filter( line -> !line.startsWith( "#" ) ) 568 .forEach( line -> parseAndAdd( entityMap, line ) ); 569 } 570 catch( final IOException e ) 571 { 572 throw new Error( "Failed to read resource " + resourceURL, e ); 573 } 574 } // loadEntities() 575 576 /** 577 * Parses the given input line for an entity name and the related code 578 * point, and adds both to the given entity map. 579 * 580 * @param entityMap The map that is the target for the entities. 581 * @param inputLine The input line. 582 */ 583 @SuppressWarnings( "ProhibitedExceptionThrown" ) 584 private static void parseAndAdd( final EntityMap entityMap, final String inputLine ) 585 { 586 //---* Strip the comment *--------------------------------------------- 587 var pos = inputLine.indexOf( "#" ); 588 final var data = (pos < 0 ? inputLine : inputLine.substring( 0, pos )).trim(); 589 590 //---* Split the data *------------------------------------------------ 591 pos = data.indexOf( "=" ); 592 if( pos < 0 ) throw new Error( "Invalid input data: " + inputLine ); 593 final var entityName = data.substring( 0, pos ).trim(); 594 final var value = data.substring( pos + 1 ).trim(); 595 try 596 { 597 final var codePoint = Integer.valueOf( value ); 598 entityMap.add( entityName, codePoint ); 599 } 600 catch( final NumberFormatException e ) 601 { 602 throw new Error( "Invalid input data: " + inputLine, e ); 603 } 604 } // parseAndAdd() 605 606 /** 607 * <p>{@summary Unescapes the entities in a {@code String}.}</p> 608 * <p>For example, if you have called {@code addEntity( "foo", 0xA1 )}, 609 * a call to {@code unescape( "&foo;")} will return {@code "\u00A1"}.</p> 610 * 611 * @param input The {@code String} to escape. 612 * @return A new escaped {@code String}. 613 */ 614 @SuppressWarnings( "UnnecessaryUnicodeEscape" ) 615 public final String unescape( final CharSequence input ) 616 { 617 var retValue = requireNonNullArgument( input, "input" ).toString(); 618 final var firstAmp = retValue.indexOf( '&' ); 619 if( firstAmp >= 0 ) 620 { 621 final var buffer = new StringBuilder( input.length() * 2 ); 622 try 623 { 624 doUnescape( buffer, input, firstAmp ); 625 } 626 catch( final IOException e ) 627 { 628 /* 629 * Operations on a StringBuilder should not cause an 630 * IOException. 631 */ 632 throw new UnexpectedExceptionError( e ); 633 } 634 retValue = buffer.toString(); 635 } 636 637 //---* Done *---------------------------------------------------------- 638 return retValue; 639 } // unescape() 640 641 /** 642 * Unescapes the escaped entities in the {@code String} passed and writes 643 * the result to the 644 * {@link Appendable} 645 * passed. 646 * 647 * @param appendable The {@code Appendable} to write the results to. 648 * @param input The source {@code String} to unescape. 649 * @throws IOException when {@code Appendable} passed throws the exception 650 * from calls to the 651 * {@link Appendable#append(char)} 652 * method. 653 * @see #unescape(CharSequence) 654 */ 655 public final void unescape( final Appendable appendable, final CharSequence input ) throws IOException 656 { 657 final var firstAmp = requireNonNullArgument( input, "input" ).toString().indexOf( "&" ); 658 if( firstAmp >= 0 ) 659 { 660 doUnescape( requireNonNullArgument( appendable, "appendable" ), input, firstAmp ); 661 } 662 else 663 { 664 requireNonNullArgument( appendable, "appendable" ).append( input ); 665 } 666 } // unescape() 667} 668// class Entities 669 670/* 671 * End of File 672 */