001/*
002 * ============================================================================
003 * Copyright © 2002-2023 by Thomas Thrien.
004 * All Rights Reserved.
005 * ============================================================================
006 * Licensed to the public under the agreements of the GNU Lesser General Public
007 * License, version 3.0 (the "License"). You may obtain a copy of the License at
008 *
009 *      http://www.gnu.org/licenses/lgpl.html
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
013 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
014 * License for the specific language governing permissions and limitations
015 * under the License.
016 */
017
018package org.tquadrat.foundation.util.internal;
019
020import static java.lang.String.format;
021import static java.util.Arrays.stream;
022import static java.util.stream.Collectors.joining;
023import static java.util.stream.Stream.builder;
024import static org.apiguardian.api.API.Status.INTERNAL;
025import static org.tquadrat.foundation.lang.CommonConstants.UTF8;
026import static org.tquadrat.foundation.lang.Objects.isNull;
027import static org.tquadrat.foundation.lang.Objects.nonNull;
028import static org.tquadrat.foundation.lang.Objects.requireNonNullArgument;
029import static org.tquadrat.foundation.lang.Objects.requireNotEmptyArgument;
030import static org.tquadrat.foundation.util.StringUtils.isEmptyOrBlank;
031import static org.tquadrat.foundation.util.StringUtils.isNotEmptyOrBlank;
032
033import java.io.BufferedReader;
034import java.io.IOException;
035import java.io.InputStreamReader;
036import java.net.URL;
037import java.util.Map;
038import java.util.Optional;
039import java.util.TreeMap;
040import java.util.function.Supplier;
041import java.util.stream.Stream;
042import java.util.stream.Stream.Builder;
043
044import org.apiguardian.api.API;
045import org.tquadrat.foundation.annotation.ClassVersion;
046import org.tquadrat.foundation.exception.UnexpectedExceptionError;
047import org.tquadrat.foundation.lang.Lazy;
048import org.tquadrat.foundation.util.StringUtils;
049
050/**
051 *  Provides HTML and XML entity utilities.
052 *
053 *  @extauthor Thomas Thrien - thomas.thrien@tquadrat.org
054 *  @thanks Alexander Day Chaffee - alex@purpletech.com
055 *  @thanks Gary Gregory - ggregory@seagullsw.com
056 *  @inspired Some code I found somewhere long time ago, originally written by
057 *      Alexander Day Chaffee and Gary Gregory
058 *  @version $Id: Entities.java 1060 2023-09-24 19:21:40Z tquadrat $
059 *  @since 0.0.5
060 *
061 *  @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a>
062 *  @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a>
063 *  @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a>
064 *  @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a>
065 *  @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a>
066 *  @see <a href="https://www.quackit.com/character_sets/html5_entities/html5_entities_all.cfm">HTML5 Entities in Alphabetical Order - Complete List</a>
067 *
068 *  @UMLGraph.link
069 */
070@ClassVersion( sourceVersion = "$Id: Entities.java 1060 2023-09-24 19:21:40Z tquadrat $" )
071@API( status = INTERNAL, since = "0.0.5" )
072public final class Entities
073{
074        /*---------------*\
075    ====** Inner Classes **====================================================
076        \*---------------*/
077    /**
078     *  Local interface for the data structure that is used to store the
079     *  entity mappings.
080     *
081     *  @extauthor Alexander Day Chaffee - alex@purpletech.com
082     *  @extauthor Gary Gregory - ggregory@seagullsw.com
083     *  @extauthor Thomas Thrien - thomas.thrien@tquadrat.org
084     *  @version $Id: Entities.java 1060 2023-09-24 19:21:40Z tquadrat $
085     *  @since 0.0.5
086     *
087     *  @UMLGraph.link
088     */
089    @ClassVersion( sourceVersion = "$Id: Entities.java 1060 2023-09-24 19:21:40Z tquadrat $" )
090    private static interface EntityMap
091    {
092            /*---------*\
093        ====** Methods **======================================================
094            \*---------*/
095        /**
096         *  Adds an entry to this entity map.<br>
097         *  <br>If the value is negative, only the name to value relation will
098         *  be stored.
099         *
100         *  @param  name    The entity name.
101         *  @param  value   The entity value (the Unicode code).
102         */
103        public void add( String name, Integer value );
104
105        /**
106         *  Returns the entities.
107         *
108         *  @return The entities.
109         */
110        public Stream<String> list();
111
112        /**
113         *  Returns the name of the entity identified by the specified value.
114         *
115         *  @param  value   The value to locate.
116         *  @return An instance of
117         *      {@link Optional}
118         *      that holds the entity name that is associated with the
119         *      specified value.
120         */
121        public Optional<String> name( int value );
122
123        /**
124         *  Returns the value of the entity identified by the specified name.
125         *
126         *  @param  name    The name of the entity to locate
127         *  @return An instance of
128         *      {@link Optional}
129         *      that holds the entity value associated with the specified name.
130         */
131        public Optional<Integer> value( String name );
132    }
133    //  interface EntityMap
134
135    /**
136     *  A simple implementation for the interface
137     *  {@link EntityMap}.
138     *
139     *  @extauthor Alexander Day Chaffee - alex@purpletech.com
140     *  @extauthor Gary Gregory - ggregory@seagullsw.com
141     *  @extauthor Thomas Thrien - thomas.thrien@tquadrat.org
142     *  @version $Id: Entities.java 1060 2023-09-24 19:21:40Z tquadrat $
143     *  @since 0.0.5
144     *
145     *  @UMLGraph.link
146     */
147    @ClassVersion( sourceVersion = "$Id: Entities.java 1060 2023-09-24 19:21:40Z tquadrat $" )
148    private static class PrimitiveEntityMap implements EntityMap
149    {
150            /*------------*\
151        ====** Attributes **===================================================
152            \*------------*/
153        /**
154         *  The map that holds the references from the name to the value.
155         */
156        private final Map<String,Integer> m_NameToValue = new TreeMap<>();
157
158        /**
159         *  The map that holds the references from the value to the name.
160         */
161        private final Map<Integer,String> m_ValueToName = new TreeMap<>();
162
163            /*--------------*\
164        ====** Constructors **=================================================
165            \*--------------*/
166        /**
167         *  Creates a new primitive entity map.
168         */
169        public PrimitiveEntityMap() { /* Does nothing but exist */ }
170
171            /*---------*\
172        ====** Methods **======================================================
173            \*---------*/
174        /**
175         * {@inheritDoc}
176         */
177        @Override
178        public final void add( final String name, final Integer value )
179        {
180            assert isNotEmptyOrBlank( name ) : "name is empty or null";
181            assert nonNull( value ) : "value is null";
182            assert value.intValue() != 0 : "value is 0";
183
184            if( value.intValue() > 0 )
185            {
186                m_NameToValue.put( name, value );
187                final var previousName = m_ValueToName.put( value, name );
188                assert isNull( previousName ) : "Duplicate: %s, %s, %d".formatted( name, previousName, value );
189            }
190            else
191            {
192                m_NameToValue.put( name, Integer.valueOf( -value.intValue() ) );
193            }
194        }   //  add()
195
196        /**
197         * {@inheritDoc}
198         */
199        @Override
200        public final Stream<String> list() { return m_NameToValue.keySet().stream(); }
201
202        /**
203         * {@inheritDoc}
204         */
205        @Override
206        public final Optional<String> name( final int value ) { return Optional.ofNullable( m_ValueToName.get( value ) ); }
207
208        /**
209         * {@inheritDoc}
210         */
211        @Override
212        public final Optional<Integer> value( final String name )
213        {
214            final var retValue = Optional.ofNullable( m_NameToValue.get( requireNotEmptyArgument( name, "name" ) ) );
215
216            //---* Done *------------------------------------------------------
217            return retValue;
218        }   //  value()
219    }
220    //  class PrimitiveEntityMap
221
222        /*-----------*\
223    ====** Constants **========================================================
224        \*-----------*/
225    /**
226     *  The name for the resource file that holds the additional XML entities:
227     *  {@value}. These entities are not defined for HTML before HTML&nbsp;5.
228     */
229    public static final String ADDITIONAL_XML_ENTITIES = "apos_entities.data";
230
231    /**
232     *  The name for the resource file that holds the basic entities that are
233     *  common for both XML and HTML: {@value}.
234     */
235    public static final String BASIC_ENTITIES = "basic_entities.data";
236
237    /**
238     *  The name for the resource file that holds the entities that were
239     *  introduced for HTML&nbsp;3.2: {@value}.
240     */
241    public static final String HTML32_ENTITIES = "ISO8859_1_entities.data";
242
243    /**
244     *  The name for the resource file that holds the entities that were
245     *  introduced for HTML&nbsp;4.0: {@value}.
246     */
247    public static final String HTML40_ENTITIES = "html40_entities.data";
248
249    /**
250     *  The name for the resource file that holds the entities that were
251     *  introduced for HTML&nbsp;5.0: {@value}.
252     */
253    public static final String HTML50_ENTITIES = "html50_entities.data";
254
255        /*------------*\
256    ====** Attributes **=======================================================
257        \*------------*/
258    /**
259     *  The entity mapping; it is populated by the supplier that the constructor hands over to {@code Lazy.use()}.
260     */
261    private final Lazy<EntityMap> m_EntityMap;
262
263        /*------------------------*\
264    ====** Static Initialisations **===========================================
265        \*------------------------*/
266    /**
267     *  The set of basic entities, defined by the resource {@link #BASIC_ENTITIES}.
268     */
269    public static final Entities BASIC;
270
271    /**
272     *  The set of entities supported by HTML 3.2: the basic entities plus {@link #HTML32_ENTITIES}.
273     */
274    public static final Entities HTML32;
275
276    /**
277     *  The set of entities supported by HTML 4.0: those for HTML 3.2 plus {@link #HTML40_ENTITIES}.
278     */
279    public static final Entities HTML40;
280
281    /**
282     *  The set of entities supported by HTML 5.0: those for HTML 4.0 plus {@link #HTML50_ENTITIES}.
283     */
284    public static final Entities HTML50;
285
286    /**
287     *  The set of entities supported by standard XML: the basic entities plus {@link #ADDITIONAL_XML_ENTITIES}.
288     */
289    public static final Entities XML;
290
291    static
292    {
293        //---* Set the BASIC entities *----------------------------------------
294        BASIC = new Entities( BASIC_ENTITIES );
295
296        //---* Set the HTML 3.2 entities *-------------------------------------
297        HTML32 = new Entities( BASIC_ENTITIES, HTML32_ENTITIES );
298
299        //---* Set the HTML 4.0 entities *-------------------------------------
300        HTML40 = new Entities( BASIC_ENTITIES, HTML32_ENTITIES, HTML40_ENTITIES );
301
302        //---* Set the HTML 5.0 entities *-------------------------------------
303        HTML50 = new Entities( BASIC_ENTITIES, HTML32_ENTITIES, HTML40_ENTITIES, HTML50_ENTITIES );
304
305        //---* Set the XML entities *------------------------------------------
306        XML = new Entities( BASIC_ENTITIES, ADDITIONAL_XML_ENTITIES );
307    }
308
309        /*--------------*\
310    ====** Constructors **=====================================================
311        \*--------------*/
312    /**
313     *  Creates a new {@code Entities} instance.
314     *
315     *  @param  resourceNames   The names of the resource files with the entity
316     *      definitions.
317     */
318    private Entities( final String... resourceNames )
319    {
320        final var thisClass = getClass();
321        final var packageName = thisClass.getPackageName().replace( '.', '/' );
322        final var resourceURLs = stream( resourceNames )
323            .map( name ->
324            {
325                final var resourceName = format( "/%s/%s", packageName, name );
326                final var resourceURL = thisClass.getResource( resourceName );
327                assert nonNull( resourceURL ) : "URL is null for %s".formatted( resourceName );
328                return resourceURL;
329            } )
330            .toArray( URL []::new );
331
332        final var supplier = (Supplier<EntityMap>) () ->
333        {
334            final EntityMap map = new PrimitiveEntityMap();
335            for( final var resourceURL : resourceURLs )
336            {
337                loadEntities( map, resourceURL );
338            }
339            return map;
340        };
341
342        m_EntityMap = Lazy.use( supplier );
343    }   //  Entities()
344
345        /*---------*\
346    ====** Methods **==========================================================
347        \*---------*/
348    /**
349     *  Underlying unescape method that allows the optimisation of not starting
350     *  from the 0 index again.
351     *
352     *  @param  buffer  The buffer to write the results to.
353     *  @param  source  The source {@code String} to unescape.
354     *  @param  firstAmp    The index of the first ampersand in the source.
355     *  @throws IOException Problems on writing to the {@code buffer}.
356     */
357    @SuppressWarnings( {"MagicNumber", "OverlyNestedMethod", "OverlyComplexMethod"} )
358    private void doUnescape( final Appendable buffer, final CharSequence source, final int firstAmp ) throws IOException
359    {
360        assert nonNull( buffer ) : "buffer is null";
361        assert nonNull( source ) : "source is null";
362        assert firstAmp >= 0 : "firstAmp is less than 0";
363
364        buffer.append( source, 0, firstAmp );
365        final var str = source.toString();
366        final var len = str.length();
367        char isHexChar;
368        int nextIndex;
369        int semiColonIndex;
370        int ampersandIndex;
371        int entityContentLen;
372        Optional<Integer> entityValue;
373        ScanLoop: for( var i = firstAmp; i < len; ++i )
374        {
375            final var currentCharacter = str.charAt( i );
376            if( currentCharacter == '&' )
377            {
378                nextIndex = i + 1;
379                semiColonIndex = str.indexOf( ';', nextIndex );
380                if( semiColonIndex == -1 )
381                {
382                    buffer.append( '&' );
383                    continue ScanLoop;
384                }
385                ampersandIndex = str.indexOf( '&', i + 1 );
386                if( (ampersandIndex != -1) && (ampersandIndex < semiColonIndex) )
387                {
388                    //---* The text looks like "&...&...;" *-------------------
389                    buffer.append( '&' );
390                    continue ScanLoop;
391                }
392                final var entityContent = str.substring( nextIndex, semiColonIndex );
393                entityValue = Optional.empty();
394                entityContentLen = entityContent.length();
395                if( entityContentLen > 0 )
396                {
397                    if( entityContent.charAt( 0 ) == '#' )
398                    {
399                        /*
400                         * Escaped value content is an integer (decimal or
401                         * hexadecimal)
402                         */
403                        if( entityContentLen > 1 )
404                        {
405                            isHexChar = entityContent.charAt( 1 );
406                            try
407                            {
408                                final var value = switch( isHexChar )
409                                {
410                                    case 'X', 'x' -> Integer.parseInt( entityContent.substring( 2 ), 0x10 );
411                                    default -> Integer.parseInt( entityContent.substring( 1 ), 10 );
412                                };
413                                entityValue = value > 0xFFFFFF ? Optional.empty() : Optional.of( Integer.valueOf( value ) );
414                            }
415                            catch( final NumberFormatException ignored )
416                            {
417                                entityValue = Optional.empty();
418                            }
419                        }
420                    }
421                    else
422                    {
423                        //---* Escaped value content is an entity name *-------
424                        entityValue = entityValue( entityContent );
425                    }
426                }
427                buffer.append( entityValue.map( v -> Character.toString( v.intValue() ) ).orElseGet( () -> format( "&%s;", entityContent ) ) );
428
429                //---* Move  the index up to the semi-colon *------------------
430                //noinspection AssignmentToForLoopParameter
431                i = semiColonIndex;
432            }
433            else
434            {
435                buffer.append( currentCharacter );
436            }
437        }   //  ScanLoop:
438    }   //  doUnescape()
439
440    /**
441     *  Returns the name of the entity identified by the specified value.
442     *
443     *  @param  value   The value to locate.
444     *  @return An instance of
445     *      {@link Optional}
446     *      that holds the entity name that is associated with the specified
447     *      value.
448     */
449    public final Optional<String> entityName( final int value ) { return m_EntityMap.get().name( value ); }
450
451    /**
452     *  Returns the value of the entity identified by the specified name.
453     *
454     *  @param  name    The name to locate.
455     *  @return An instance of
456     *      {@link Optional}
457     *      that holds the entity value associated with the specified name.
458     */
459    public final Optional<Integer> entityValue( final String name ) { return m_EntityMap.get().value( name ); }
460
461    /**
462     *  Escapes the characters in a {@code String}.<br>
463     *  <br>For example, if you have called
464     *  {@code addEntity( "foo", "0xA1" )}, a call to
465     *  {@code escape( "\u00A1" )} will return {@code "&foo;"}.
466     *
467     *  @param  source  The {@code String} to escape.
468     *  @return A new escaped {@code String}.
469     */
470    @SuppressWarnings( "UnnecessaryUnicodeEscape" )
471    public final String escape( final CharSequence source )
472    {
473        @SuppressWarnings( "NumericCastThatLosesPrecision" )
474        final var retValue = requireNonNullArgument( source, "source" )
475            .codePoints()
476            .mapToObj( codePoint -> entityName( codePoint ).map( name -> format( "&%s;", name ) )
477                .orElseGet( () -> codePoint > 0x7F ? formatCodePoint( codePoint ) : Character.toString( (char) codePoint ) ) )
478            .collect( joining() );
479
480        //---* Done *----------------------------------------------------------
481        return retValue;
482    }   //  escape()
483
484    /**
485     *  Escapes the characters in the {@code String} passed and writes the
486     *  result to the
487     *  {@link Appendable}
488     *  passed.
489     *
490     *  @param  appendable  The {@code Appendable} to write the results of the
491     *      escaping to.
492     *  @param  input   The {@code String} to escape.
493     *  @throws IOException when {@code Appendable} passed throws the exception
494     *      from calls to the
495     *      {@link Appendable#append(char)}
496     *      method.
497     *  @see #escape(CharSequence)
498     */
499    public final void escape( final Appendable appendable, final CharSequence input ) throws IOException
500    {
501        requireNonNullArgument( appendable, "appendable" ).append( escape( requireNonNullArgument( input, "input" ) ) );
502    }   //  escape()
503
504    /**
505     *  Converts a code point into the numerical HTML escape format.
506     *
507     *  @param  codePoint   The code point.
508     *  @return The HTML escaped code point.
509     */
510    private static final String formatCodePoint( final int codePoint )
511    {
512        final Builder<String> builder = builder();
513        for( final var c : Character.toChars( codePoint ) )
514        {
515            builder.add( format( "&#x%X;", (int) c ) );
516        }
517        final var retValue = builder.build().collect( joining() );
518
519        //---* Done *----------------------------------------------------------
520        return retValue;
521    }   //  formatCodePoint()
522
523    /**
524     *  Returns a list of all known entities.
525     *
526     *  @return An array of String with the entities, there numerical values
527     *      and the Unicode name of the entity.
528     */
529    public final String [] listEntities()
530    {
531        final var retValue = m_EntityMap.get().list()
532            .sorted()
533            .map( e ->
534            {
535                /*
536                 * For all existing entities, entityValue() will return a
537                 * value, so that the check on presence is obsolete.
538                 */
539                @SuppressWarnings( "OptionalGetWithoutIsPresent" )
540                final var value = entityValue( e ).get().intValue();
541                final var unicode = Character.getName( value );
542                return "&%1$s; = &#%2$d = &#%2$X%3$s".formatted( e, value, isEmptyOrBlank( unicode ) ? "" : format( " (%s)", unicode ) );
543            } )
544            .toArray( String []::new );
545
546        //---* Done *----------------------------------------------------------
547        return retValue;
548    }   //  listEntities()
549
550    /**
551     *  Load the entities from the resource identified by the given URL to the
552     *  given target entity map.
553     *
554     *  @param  entityMap   The map that is the target for the entities.
555     *  @param  resourceURL The URL for the resource.
556     */
557    @SuppressWarnings( "ProhibitedExceptionThrown" )
558    private static void loadEntities( final EntityMap entityMap, final URL resourceURL )
559    {
560        assert nonNull( entityMap ) : "entityMap is null";
561        assert nonNull( resourceURL ) : "resourceURL is null";
562
563        try( final var reader = new BufferedReader( new InputStreamReader( resourceURL.openStream(), UTF8 ) ) )
564        {
565            reader.lines()
566                .filter( StringUtils::isNotEmptyOrBlank )
567                .filter( line -> !line.startsWith( "#" ) )
568                .forEach( line -> parseAndAdd( entityMap, line ) );
569        }
570        catch( final IOException e )
571        {
572            throw new Error( "Failed to read resource " + resourceURL, e );
573        }
574    }   //  loadEntities()
575
576    /**
577     *  Parses the given input line for an entity name and the related code
578     *  point, and adds both to the given entity map.
579     *
580     *  @param  entityMap   The map that is the target for the entities.
581     *  @param  inputLine   The input line.
582     */
583    @SuppressWarnings( "ProhibitedExceptionThrown" )
584    private static void parseAndAdd( final EntityMap entityMap, final String inputLine )
585    {
586        //---* Strip the comment *---------------------------------------------
587        var pos = inputLine.indexOf( "#" );
588        final var data = (pos < 0 ? inputLine : inputLine.substring( 0, pos )).trim();
589
590        //---* Split the data *------------------------------------------------
591        pos = data.indexOf( "=" );
592        if( pos < 0 ) throw new Error( "Invalid input data: " + inputLine );
593        final var entityName = data.substring( 0, pos ).trim();
594        final var value = data.substring( pos + 1 ).trim();
595        try
596        {
597            final var codePoint = Integer.valueOf( value );
598            entityMap.add( entityName, codePoint );
599        }
600        catch( final NumberFormatException e )
601        {
602            throw new Error( "Invalid input data: " + inputLine, e );
603        }
604    }   //  parseAndAdd()
605
606    /**
607     *  <p>{@summary Unescapes the entities in a {@code String}.}</p>
608     *  <p>For example, if you have called {@code addEntity( "foo", 0xA1 )},
609     *  a call to {@code unescape( "&foo;")} will return {@code "\u00A1"}.</p>
610     *
611     *  @param  input   The {@code String} to escape.
612     *  @return A new escaped {@code String}.
613     */
614    @SuppressWarnings( "UnnecessaryUnicodeEscape" )
615    public final String unescape( final CharSequence input )
616    {
617        var retValue = requireNonNullArgument( input, "input" ).toString();
618        final var firstAmp = retValue.indexOf( '&' );
619        if( firstAmp >= 0 )
620        {
621            final var buffer = new StringBuilder( input.length() * 2 );
622            try
623            {
624                doUnescape( buffer, input, firstAmp );
625            }
626            catch( final IOException e )
627            {
628                /*
629                 * Operations on a StringBuilder should not cause an
630                 * IOException.
631                 */
632                throw new UnexpectedExceptionError( e );
633            }
634            retValue = buffer.toString();
635        }
636
637        //---* Done *----------------------------------------------------------
638        return retValue;
639    }   //  unescape()
640
641    /**
642     *  Unescapes the escaped entities in the {@code String} passed and writes
643     *  the result to the
644     *  {@link Appendable}
645     *  passed.
646     *
647     *  @param  appendable  The {@code Appendable} to write the results to.
648     *  @param  input   The source {@code String} to unescape.
649     *  @throws IOException when {@code Appendable} passed throws the exception
650     *      from calls to the
651     *      {@link Appendable#append(char)}
652     *      method.
653     *  @see #unescape(CharSequence)
654     */
655    public final void unescape( final Appendable appendable, final CharSequence input ) throws IOException
656    {
657        final var firstAmp = requireNonNullArgument( input, "input" ).toString().indexOf( "&" );
658        if( firstAmp >= 0 )
659        {
660            doUnescape( requireNonNullArgument( appendable, "appendable" ), input, firstAmp );
661        }
662        else
663        {
664            requireNonNullArgument( appendable, "appendable" ).append( input );
665        }
666    }   //  unescape()
667}
668//  class Entities
669
670/*
671 *  End of File
672 */