Source code

001/*
002 * ============================================================================
003 *  Copyright © 2002-2023 by Thomas Thrien.
004 *  All Rights Reserved.
005 * ============================================================================
006 *  Licensed to the public under the agreements of the GNU Lesser General Public
007 *  License, version 3.0 (the "License"). You may obtain a copy of the License at
008 *
009 *       http://www.gnu.org/licenses/lgpl.html
010 *
011 *  Unless required by applicable law or agreed to in writing, software
012 *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
013 *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
014 *  License for the specific language governing permissions and limitations
015 *  under the License.
016 */
017
018package org.tquadrat.foundation.util;
019
020import static java.lang.Character.MIN_CODE_POINT;
021import static java.lang.Character.isISOControl;
022import static java.lang.Character.isSurrogatePair;
023import static java.lang.Character.toChars;
024import static java.lang.Character.toCodePoint;
025import static java.lang.Integer.min;
026import static java.lang.String.format;
027import static java.text.Normalizer.isNormalized;
028import static java.text.Normalizer.normalize;
029import static java.util.regex.Pattern.compile;
030import static java.util.stream.Collectors.joining;
031import static org.apiguardian.api.API.Status.STABLE;
032import static org.tquadrat.foundation.lang.CommonConstants.EMPTY_STRING;
033import static org.tquadrat.foundation.lang.Objects.isNull;
034import static org.tquadrat.foundation.lang.Objects.nonNull;
035import static org.tquadrat.foundation.lang.Objects.requireNotEmptyArgument;
036import static org.tquadrat.foundation.util.StringUtils.breakString;
037import static org.tquadrat.foundation.util.StringUtils.isEmpty;
038
039import java.text.Normalizer;
040import java.util.regex.Pattern;
041
042import org.apiguardian.api.API;
043import org.tquadrat.foundation.annotation.ClassVersion;
044import org.tquadrat.foundation.annotation.UtilityClass;
045import org.tquadrat.foundation.exception.PrivateConstructorForStaticClassCalledError;
046import org.tquadrat.foundation.exception.ValidationException;
047
048/**
049 *  This class provides several utilities dealing with Strings in different
050 *  character sets/encodings.
051 *
052 *  @extauthor Thomas Thrien - thomas.thrien@tquadrat.org
053 *  @version CharSetUtils: HexUtils.java 747 2020-12-01 12:40:38Z tquadrat $
054 *
055 *  @UMLGraph.link
056 *  @since 0.1.0
057 */
058@SuppressWarnings( "MagicNumber" )
059@ClassVersion( sourceVersion = "$Id: CharSetUtils.java 1060 2023-09-24 19:21:40Z tquadrat $" )
060@API( status = STABLE, since = "0.1.0" )
061@UtilityClass
062public final class CharSetUtils
063{
064        /*--------------*\
065    ====** Constructors **=====================================================
066        \*--------------*/
067    /**
068     *  No instance allowed for this class!
069     */
070    private CharSetUtils() { throw new PrivateConstructorForStaticClassCalledError( CharSetUtils.class ); }
071
072        /*---------*\
073    ====** Methods **==========================================================
074        \*---------*/
075    /**
076     *  Converts the given byte array into to a String that will only contain
077     *  printable ASCII characters; all other characters will be 'escaped' to
078     *  the format &quot;<code>&#92;uXXXX</code>&quot;. This can be useful to
079     *  generate a String in another character set/encoding than ASCII or
080     *  UTF-8/Unicode, given that the receiving part can interpret the
081     *  format.<br>
082     *  <br>But generally, a transfer encoding like BASE64 or quoted-printable
083     *  should be preferred.
084     *
085     *  @param  bytes   The input; may be {@code null}.
086     *  @return The output string; {@code null} if the input was already
087     *      {@code null}.
088     *
089     *  @since 0.1.0
090     */
091    @API( status = STABLE, since = "0.1.0" )
092    public static final String convertBytesToASCII( final byte [] bytes )
093    {
094        String retValue = null;
095        if( nonNull( bytes ) )
096        {
097            if( bytes.length == 0 )
098            {
099                retValue = EMPTY_STRING;
100            }
101            else
102            {
103                final var buffer = new StringBuilder();
104                for( final var b : bytes )
105                {
106                    final var codePoint = (int) b;
107                    //noinspection NonStrictComparisonCanBeEquality
108                    buffer.append( (codePoint < ' ') || (codePoint >= 0x007F)
109                        ? escapeCharacter( codePoint )
110                        : Character.toString( codePoint ) );
111                }
112                retValue = buffer.toString();
113            }
114        }
115
116        //---* Done *----------------------------------------------------------
117        return retValue;
118    }   //  convertBytesToASCII()
119
120    /**
121     *  Converts a String that contains only ASCII characters and Unicode
122     *  escape sequences like &quot;<code>&#92;uXXXX</code>&quot; to the
123     *  equivalent Unicode String.<br>
124     *  <br>This method will not touch other escape sequences, like
125     *  <code>&quot;&#92;n&quot;</code> or <code>&quot;&#92;t&quot;</code>.
126     *  Refer to
127     *  {@link String#translateEscapes()}.
128     *
129     *  @param  input   The input String; may be {@code null}.
130     *  @return The output string; {@code null} if the input string was
131     *      already {@code null}.
132     *  @throws IllegalArgumentException    The given input String contained at
133     *      least one non-ASCII character.
134     *
135     *  @since 0.1.0
136     */
137    @API( status = STABLE, since = "0.1.0" )
138    public static final String convertEscapedStringToUnicode( final CharSequence input ) throws IllegalArgumentException
139    {
140        String retValue = null;
141        if( nonNull( input ) )
142        {
143            if( isEmpty( input ) )
144            {
145                retValue = EMPTY_STRING;
146            }
147            else
148            {
149                final var pattern = compile( "\\\\u\\p{XDigit}{4}" );
150                var inputPos = 0;
151                final var inputLength = input.length();
152                final var buffer = new StringBuilder( inputLength );
153                ScanLoop: while( inputPos < inputLength )
154                {
155                    final var currentChar = input.charAt( inputPos );
156                    if( currentChar == '\\' )
157                    {
158                        //---* Is this an escape sequence? *-------------------
159                        inputPos += extractEscapeSequence( buffer, pattern, input.subSequence( inputPos, min( inputLength, inputPos + 12 ) ) );
160                        continue ScanLoop;
161                    }
162
163                    buffer.append( currentChar );
164                    ++inputPos;
165                }   //  ScanLoop:
166                retValue = buffer.toString();
167            }
168        }
169
170        //---* Done *----------------------------------------------------------
171        return retValue;
172    }   //  convertEscapedStringToUnicode()
173
174    /**
175     *  Applies the given normalisation to the given Unicode String and
176     *  translates it to a String that will only contain printable ASCII
177     *  characters; all other characters will be 'escaped' to the format
178     *  &quot;<code>&#92;uXXXX</code>&quot;.
179     *
180     *  @param  normalization   The normalisation form; in case it is
181     *      {@code null}, no normalisation will be performed.
182     *  @param  input   The input String; may be {@code null}.
183     *  @return The output String; {@code null} if the input String was
184     *      already {@code null}.
185     *
186     *  @since 0.1.0
187     */
188    @API( status = STABLE, since = "0.1.0" )
189    public static final String convertUnicodeToASCII( final Normalizer.Form normalization, final CharSequence input )
190    {
191        String retValue = null;
192        if( nonNull( input ) )
193        {
194            if( isEmpty( input ) )
195            {
196                retValue = EMPTY_STRING;
197            }
198            else
199            {
200                //---* Normalise the String *----------------------------------
201                final var sequence = isNull( normalization )
202                    ? input
203                    : isNormalized( input, normalization )
204                        ? input
205                        : normalize( input, normalization );
206
207                retValue = sequence.codePoints()
208                    .mapToObj( codePoint ->
209                        isPrintableASCIICharacter( codePoint )
210                        ? Character.toString( codePoint )
211                        : escapeCharacter( codePoint ) )
212                    .collect( joining() );
213            }
214        }
215
216        //---* Done *----------------------------------------------------------
217        return retValue;
218    }   //  convertUnicodeToASCII()
219
220    /**
221     *  Translates the given Unicode String without any normalisation to a
222     *  String that will only contain printable ASCII characters; all other
223     *  characters will be 'escaped' to the format
224     *  &quot;<code>&#92;uXXXX</code>&quot;. Calling this method is the same as
225     *  calling
226     *  {@link #convertUnicodeToASCII(Normalizer.Form, CharSequence)}
227     *  with {@code null} as the first argument.
228     *
229     *  @param  input   The input String; may be {@code null}.
230     *  @return The output String; {@code null} if the input String was
231     *      already {@code null}.
232     *
233     *  @since 0.1.0
234     */
235    @API( status = STABLE, since = "0.1.0" )
236    public static final String convertUnicodeToASCII( final CharSequence input )
237    {
238        final var retValue = convertUnicodeToASCII( null, input );
239
240        //---* Done *----------------------------------------------------------
241        return retValue;
242    }   //  convertUnicodeToASCII()
243
244    /**
245     *  Returns the Unicode escape sequence for the given character. This will
246     *  return &quot;{@code &#92;u0075}&quot; for the letter 'u', and
247     *  &quot;{@code &#92;u003c}&quot; for the smaller-than sign '&lt;'.<br>
248     *  <br>This method should be used only for characters that are not
249     *  surrogates; for general use, the implementation that takes a code point
250     *  is preferred.
251     *
252     *  @param  c   The character.
253     *  @return The escape sequence.
254     *
255     *  @see #escapeCharacter(int)
256     *
257     *  @since 0.1.0
258     */
259    @API( status = STABLE, since = "0.1.0" )
260    public static final String escapeCharacter( final char c )
261    {
262        final var retValue = format( "\\u%04x", Integer.valueOf( c ) );
263
264        //---* Done *----------------------------------------------------------
265        return retValue;
266    }   //  escapeCharacter()
267
268    /**
269     *  Returns the Unicode escape sequence for the given code point. This will
270     *  return &quot;{@code &#92;u0075}&quot; for the letter 'u', and
271     *  &quot;{@code &#92;u003c}&quot; for the smaller-than sign '&lt;'.<br>
272     *  <br>This method takes only a single code point; to translate a whole
273     *  String, this code sequence can be used:<pre><code>  &hellip;
274     *  String result = input.codePoints()
275     *      .mapToObj( codePoint -&gt; escapeUnicode( codePoint ) )
276     *      .collect( Collectors.joining() );
277     *  &hellip;</code></pre>
278     *  This will escape <i>all</i> characters in the String. If only a subset
279     *  needs to be escaped, the mapping function in
280     *  {@link java.util.stream.IntStream#mapToObj(java.util.function.IntFunction) mapToObj()}
281     *  can be adjusted accordingly. Something like that is implemented with
282     *  the method
283     *  {@link #convertUnicodeToASCII(CharSequence)}.
284     *
285     *  @param  codePoint   The character.
286     *  @return The escape sequence.
287     *  @throws IllegalArgumentException    The given code point is invalid.
288     *
289     *  @see String#codePoints()
290     *  @see java.util.stream.IntStream#mapToObj(java.util.function.IntFunction)
291     *  @see java.util.stream.Stream#collect(java.util.stream.Collector)
292     *  @see java.util.stream.Collectors#joining()
293     *
294     *  @since 0.1.0
295     */
296    @API( status = STABLE, since = "0.1.0" )
297    public static final String escapeCharacter( final int codePoint ) throws IllegalArgumentException
298    {
299        final var retValue = new StringBuilder();
300        for( final var c : toChars( codePoint ) ) retValue.append( format( "\\u%04x", Integer.valueOf( c ) ) );
301
302        //---* Done *----------------------------------------------------------
303        return retValue.toString();
304    }   //  escapeCharacter()
305
306    /**
307     *  Returns {@code true} if the given character is an ASCII character.
308     *
309     *  @param  c   The character to check.
310     *  @return {@code true} if the given character is an ASCII character,
311     *      {@code false} otherwise.
312     */
313    public static final boolean isASCIICharacter( final char c )
314    {
315        return isASCIICharacter( (int) c );
316    }   //  isASCIICharacter()
317
318    /**
319     *  Returns {@code true} if the given code point represents an ASCII
320     *  character.
321     *
322     *  @param  codePoint   The code point to check.
323     *  @return {@code true} if the given code point represents an ASCII
324     *      character, {@code false} otherwise.
325     */
326    public static final boolean isASCIICharacter( final int codePoint )
327    {
328        final var retValue = (codePoint >= MIN_CODE_POINT) && (codePoint < 0x80);
329
330        //---* Done *----------------------------------------------------------
331        return retValue;
332    }   //  isASCIICharacter()
333
334    /**
335     *  Returns {@code true} if the given character is a printable ASCII
336     *  character. That means, it is an ASCII character, but not a control
337     *  character.
338     *
339     *  @param  c   The character to check.
340     *  @return {@code true} if the given character is a printable ASCII
341     *      character, {@code false} otherwise.
342     */
343    public static final boolean isPrintableASCIICharacter( final char c )
344    {
345        return isPrintableASCIICharacter( (int) c );
346    }   //  isPrintableASCIICharacter()
347
348    /**
349     *  Returns {@code true} if the given code point represents a printable
350     *  ASCII character. That means, it is an ASCII character, but not a
351     *  control character.
352     *
353     *  @param  codePoint   The code point to check.
354     *  @return {@code true} if the given code point represents a printable
355     *      ASCII character, {@code false} otherwise.
356     */
357    public static final boolean isPrintableASCIICharacter( final int codePoint )
358    {
359        final var retValue = !isISOControl(codePoint) && (codePoint >= MIN_CODE_POINT) && (codePoint < 0x80);
360
361        //---* Done *----------------------------------------------------------
362        return retValue;
363    }   //  isPrintableASCIICharacter()
364
365    /**
366     *  Extracts the escape sequence from the given chunk, write the result to
367     *  the buffer and returns the offset.
368     *
369     *  @param  buffer  The target buffer.
370     *  @param  pattern The regex pattern for the check.
371     *  @param chunk    The chunk to check.
372     *  @return The offset; one of 1, 6, or 12.
373     */
374    private static final int extractEscapeSequence( final StringBuilder buffer, final Pattern pattern, final CharSequence chunk )
375    {
376        var retValue = 1;
377        if( chunk.length() >= 6 )
378        {
379            final var c1 = chunk.subSequence( 0, 6 );
380            if( pattern.matcher( c1 ).matches() )
381            {
382                if( (chunk.length() == 12) && pattern.matcher( chunk.subSequence( 6, 12 ) ).matches() )
383                {
384                    try
385                    {
386                        buffer.append( unescapeUnicode( chunk ) );
387                        retValue = 12;
388                    }
389                    catch( final ValidationException ignored ) { /* Deliberately ignored */ }
390                }
391
392                if( retValue == 1 )
393                {
394                    try
395                    {
396                        buffer.append( unescapeUnicode( c1 ) );
397                        retValue = 6;
398                    }
399                    catch( final ValidationException ignored ) { /* Deliberately ignored */ }
400                }
401            }
402        }
403
404        if( retValue == 1 ) buffer.append( chunk.charAt( 0 ) );
405
406        //---* Done *----------------------------------------------------------
407        return retValue;
408    }   //  extractEscapeSequence()
409
410    /**
411     *  Parses Strings in the format &quot;<code>&#92;uXXXX</code>&quot;,
412     *  containing the textual representation of a single Unicode character, to
413     *  the respective Unicode character. Some Unicode characters will be
414     *  represented as <i>surrogate pairs</i> in Java, so the String that is
415     *  returned by this method may contain more than one {@code char}.<br>
416     *  <br>The input format for this method is used in Java source code
417     *  Strings, in Java {@code .properties} files, in C/C++ source code, in
418     *  JavaScript source, &hellip;
419     *
420     *  @param  input   The input String with the Unicode escape sequence.
421     *  @return The Unicode character.
422     *  @throws ValidationException The input is {@code null}, empty, or cannot
423     *      be parsed as a unicode escape sequence.
424     *
425     *  @since 0.1.5
426     */
427    @API( status = STABLE, since = "0.1.5" )
428    public static final String unescapeUnicode( final CharSequence input )
429    {
430        final var len = requireNotEmptyArgument( input, "input" ).length();
431        //noinspection MagicNumber
432        if( (len != 6) && (len != 12) ) throw new ValidationException( "The length of a Unicode String must be 6 or 12 characters" );
433        if( !input.subSequence( 0, 2 ).equals( "\\u" ) ) throw new ValidationException( "Unicode String must start with '\\u'" );
434
435        final var msgCannotparse = "Cannot parse '%s' as a Unicode Escape String";
436        @SuppressWarnings( "NumericCastThatLosesPrecision" )
437        final var characters = breakString( input, 6 )
438            .mapToInt( chunk ->
439            {
440                try
441                {
442                    return Integer.parseInt( chunk.subSequence( 2, 6 ).toString(), 0x10 );
443                }
444                catch( final NumberFormatException e )
445                {
446                    throw new ValidationException( format( msgCannotparse, input ), e );
447                }
448            } )
449            .mapToObj( i -> Character.valueOf( (char) i ) )
450            .toArray( Character []::new );
451
452        final var codePoint = switch( characters.length )
453            {
454                case 1 -> characters [0];
455                case 2 ->
456                    {
457                        if( !isSurrogatePair( characters [0], characters [1] ) )
458                        {
459                            throw new ValidationException( format( msgCannotparse, input ) );
460                        }
461                        yield toCodePoint( characters [0], characters [1] );
462                    }
463                default -> throw new ValidationException( format( msgCannotparse, input ) );
464            };
465        if( !Character.isValidCodePoint( codePoint ) ) throw new ValidationException( format( msgCannotparse, input ) );
466        final var retValue = new String( toChars( codePoint ) );
467
468        //---* Done *----------------------------------------------------------
469        return retValue;
470    }   //  unescapeUnicode()
471}
472//  class CharSetUtils
473
474/*
475 *  End of File
476 */