001/*
002 * ============================================================================
003 *  Copyright © 2002-2026 by Thomas Thrien.
004 *  All Rights Reserved.
005 * ============================================================================
006 *  Licensed to the public under the agreements of the GNU Lesser General Public
007 *  License, version 3.0 (the "License"). You may obtain a copy of the License at
008 *
009 *       http://www.gnu.org/licenses/lgpl.html
010 *
011 *  Unless required by applicable law or agreed to in writing, software
012 *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
013 *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
014 *  License for the specific language governing permissions and limitations
015 *  under the License.
016 */
017
018package org.tquadrat.foundation.util;
019
020import org.apiguardian.api.API;
021import org.tquadrat.foundation.annotation.ClassVersion;
022import org.tquadrat.foundation.annotation.UtilityClass;
023import org.tquadrat.foundation.exception.PrivateConstructorForStaticClassCalledError;
024import org.tquadrat.foundation.exception.ValidationException;
025
026import java.text.Normalizer;
027import java.util.regex.Pattern;
028
029import static java.lang.Character.MIN_CODE_POINT;
030import static java.lang.Character.isISOControl;
031import static java.lang.Character.isSurrogatePair;
032import static java.lang.Character.toChars;
033import static java.lang.Character.toCodePoint;
034import static java.lang.Integer.min;
035import static java.lang.String.format;
036import static java.text.Normalizer.isNormalized;
037import static java.text.Normalizer.normalize;
038import static java.util.regex.Pattern.compile;
039import static java.util.stream.Collectors.joining;
040import static org.apiguardian.api.API.Status.STABLE;
041import static org.tquadrat.foundation.lang.CommonConstants.EMPTY_STRING;
042import static org.tquadrat.foundation.lang.Objects.isNull;
043import static org.tquadrat.foundation.lang.Objects.nonNull;
044import static org.tquadrat.foundation.lang.Objects.requireNotEmptyArgument;
045import static org.tquadrat.foundation.util.StringUtils.breakString;
046import static org.tquadrat.foundation.util.StringUtils.isEmpty;
047
048/**
049 *  This class provides several utilities dealing with Strings in different
050 *  character sets/encodings.
051 *
052 *  @extauthor Thomas Thrien - thomas.thrien@tquadrat.org
053 *  @version CharSetUtils: HexUtils.java 747 2020-12-01 12:40:38Z tquadrat $
054 *
055 *  @UMLGraph.link
056 *  @since 0.1.0
057 */
058@SuppressWarnings( "MagicNumber" )
059@ClassVersion( sourceVersion = "$Id: CharSetUtils.java 1163 2026-03-20 15:28:33Z tquadrat $" )
060@API( status = STABLE, since = "0.1.0" )
061@UtilityClass
062public final class CharSetUtils
063{
064        /*--------------*\
065    ====** Constructors **=====================================================
066        \*--------------*/
067    /**
068     *  No instance allowed for this class!
069     */
070    private CharSetUtils() { throw new PrivateConstructorForStaticClassCalledError( CharSetUtils.class ); }
071
072        /*---------*\
073    ====** Methods **==========================================================
074        \*---------*/
075    /**
076     *  Converts the given byte array into to a String that will only contain
077     *  printable ASCII characters; all other characters will be 'escaped' to
078     *  the format &quot;<code>&#92;uXXXX</code>&quot;. This can be useful to
079     *  generate a String in another character set/encoding than ASCII or
080     *  UTF-8/Unicode, given that the receiving part can interpret the
081     *  format.<br>
082     *  <br>But generally, a transfer encoding like BASE64 or quoted-printable
083     *  should be preferred.
084     *
085     *  @param  bytes   The input; may be {@code null}.
086     *  @return The output string; {@code null} if the input was already
087     *      {@code null}.
088     *
089     *  @since 0.1.0
090     */
091    @API( status = STABLE, since = "0.1.0" )
092    public static final String convertBytesToASCII( final byte [] bytes )
093    {
094        String retValue = null;
095        if( nonNull( bytes ) )
096        {
097            if( bytes.length == 0 )
098            {
099                retValue = EMPTY_STRING;
100            }
101            else
102            {
103                final var buffer = new StringBuilder();
104                for( final var b : bytes )
105                {
106                    final var codePoint = (int) b;
107                    //noinspection NonStrictComparisonCanBeEquality
108                    buffer.append( (codePoint < ' ') || (codePoint >= 0x007F)
109                        ? escapeCharacter( codePoint )
110                        : Character.toString( codePoint ) );
111                }
112                retValue = buffer.toString();
113            }
114        }
115
116        //---* Done *----------------------------------------------------------
117        return retValue;
118    }   //  convertBytesToASCII()
119
120    /**
121     *  Converts a String that contains only ASCII characters and Unicode
122     *  escape sequences like &quot;<code>&#92;uXXXX</code>&quot; to the
123     *  equivalent Unicode String.<br>
124     *  <br>This method will not touch other escape sequences, like
125     *  <code>&quot;&#92;n&quot;</code> or <code>&quot;&#92;t&quot;</code>.
126     *  Refer to
127     *  {@link String#translateEscapes()}.
128     *
129     *  @param  input   The input String; may be {@code null}.
130     *  @return The output string; {@code null} if the input string was
131     *      already {@code null}.
132     *  @throws IllegalArgumentException    The given input String contained at
133     *      least one non-ASCII character.
134     *
135     *  @since 0.1.0
136     */
137    @API( status = STABLE, since = "0.1.0" )
138    public static final String convertEscapedStringToUnicode( final CharSequence input ) throws IllegalArgumentException
139    {
140        String retValue = null;
141        if( nonNull( input ) )
142        {
143            if( isEmpty( input ) )
144            {
145                retValue = EMPTY_STRING;
146            }
147            else
148            {
149                final var pattern = compile( "\\\\u\\p{XDigit}{4}" );
150                var inputPos = 0;
151                final var inputLength = input.length();
152                final var buffer = new StringBuilder( inputLength );
153                ScanLoop: while( inputPos < inputLength )
154                {
155                    final var currentChar = input.charAt( inputPos );
156                    if( currentChar == '\\' )
157                    {
158                        //---* Is this an escape sequence? *-------------------
159                        inputPos += extractEscapeSequence( buffer, pattern, input.subSequence( inputPos, min( inputLength, inputPos + 12 ) ) );
160                        continue ScanLoop;
161                    }
162
163                    buffer.append( currentChar );
164                    ++inputPos;
165                }   //  ScanLoop:
166                retValue = buffer.toString();
167            }
168        }
169
170        //---* Done *----------------------------------------------------------
171        return retValue;
172    }   //  convertEscapedStringToUnicode()
173
174    /**
175     *  Applies the given normalisation to the given Unicode String and
176     *  translates it to a String that will only contain printable ASCII
177     *  characters; all other characters will be 'escaped' to the format
178     *  &quot;<code>&#92;uXXXX</code>&quot;.
179     *
180     *  @param  normalization   The normalisation form; in case it is
181     *      {@code null}, no normalisation will be performed.
182     *  @param  input   The input String; may be {@code null}.
183     *  @return The output String; {@code null} if the input String was
184     *      already {@code null}.
185     *
186     *  @since 0.1.0
187     */
188    @API( status = STABLE, since = "0.1.0" )
189    public static final String convertUnicodeToASCII( final Normalizer.Form normalization, final CharSequence input )
190    {
191        String retValue = null;
192        if( nonNull( input ) )
193        {
194            if( isEmpty( input ) )
195            {
196                retValue = EMPTY_STRING;
197            }
198            else
199            {
200                //---* Normalise the String *----------------------------------
201                final var sequence = isNull( normalization )
202                    ? input
203                    : isNormalized( input, normalization )
204                        ? input
205                        : normalize( input, normalization );
206
207                retValue = sequence.codePoints()
208                    .mapToObj( codePoint ->
209                        isPrintableASCIICharacter( codePoint )
210                        ? Character.toString( codePoint )
211                        : escapeCharacter( codePoint ) )
212                    .collect( joining() );
213            }
214        }
215
216        //---* Done *----------------------------------------------------------
217        return retValue;
218    }   //  convertUnicodeToASCII()
219
220    /**
221     *  Translates the given Unicode String without any normalisation to a
222     *  String that will only contain printable ASCII characters; all other
223     *  characters will be 'escaped' to the format
224     *  &quot;<code>&#92;uXXXX</code>&quot;. Calling this method is the same as
225     *  calling
226     *  {@link #convertUnicodeToASCII(Normalizer.Form, CharSequence)}
227     *  with {@code null} as the first argument.
228     *
229     *  @param  input   The input String; may be {@code null}.
230     *  @return The output String; {@code null} if the input String was
231     *      already {@code null}.
232     *
233     *  @since 0.1.0
234     */
235    @API( status = STABLE, since = "0.1.0" )
236    public static final String convertUnicodeToASCII( final CharSequence input )
237    {
238        final var retValue = convertUnicodeToASCII( null, input );
239
240        //---* Done *----------------------------------------------------------
241        return retValue;
242    }   //  convertUnicodeToASCII()
243
244    /**
245     *  Returns the Unicode escape sequence for the given character. This will
246     *  return &quot;{@code &#92;u0075}&quot; for the letter 'u', and
247     *  &quot;{@code &#92;u003c}&quot; for the smaller-than sign '&lt;'.<br>
248     *  <br>This method should be used only for characters that are not
249     *  surrogates; for general use, the implementation that takes a code point
250     *  is preferred.
251     *
252     *  @param  c   The character.
253     *  @return The escape sequence.
254     *
255     *  @see #escapeCharacter(int)
256     *
257     *  @since 0.1.0
258     */
259    @API( status = STABLE, since = "0.1.0" )
260    public static final String escapeCharacter( final char c )
261    {
262        final var retValue = format( "\\u%04x", Integer.valueOf( c ) );
263
264        //---* Done *----------------------------------------------------------
265        return retValue;
266    }   //  escapeCharacter()
267
268    /**
269     *  <p>{@summary Returns the Unicode escape sequence for the given code
270     *  point.} This will return &quot;{@code &#92;u0075}&quot; for the letter 'u', and
271     *  &quot;{@code &#92;u003c}&quot; for the smaller-than sign '&lt;'.</p>
272     *  <p>This method takes only a single code point; to translate a whole
273     *  String, this code sequence can be used:</p>
274     *  <div class="source-container"><pre>&hellip;
275     *  String result = input.codePoints()
276     *      .mapToObj( codePoint -&gt; escapeUnicode( codePoint ) )
277     *      .collect( Collectors.joining() );
278     *  &hellip;</pre></div>
279     *  <p>This will escape <i>all</i> characters in the String. If only a subset
280     *  needs to be escaped, the mapping function in
281     *  {@link java.util.stream.IntStream#mapToObj(java.util.function.IntFunction) mapToObj()}
282     *  can be adjusted accordingly. Something like that is implemented with
283     *  the method
284     *  {@link #convertUnicodeToASCII(CharSequence)}.</p>
285     *
286     *  @param  codePoint   The character.
287     *  @return The escape sequence.
288     *  @throws IllegalArgumentException    The given code point is invalid.
289     *
290     *  @see String#codePoints()
291     *  @see java.util.stream.IntStream#mapToObj(java.util.function.IntFunction)
292     *  @see java.util.stream.Stream#collect(java.util.stream.Collector)
293     *  @see java.util.stream.Collectors#joining()
294     *
295     *  @since 0.1.0
296     */
297    @API( status = STABLE, since = "0.1.0" )
298    public static final String escapeCharacter( final int codePoint ) throws IllegalArgumentException
299    {
300        final var retValue = new StringBuilder();
301        for( final var c : toChars( codePoint ) ) retValue.append( format( "\\u%04x", Integer.valueOf( c ) ) );
302
303        //---* Done *----------------------------------------------------------
304        return retValue.toString();
305    }   //  escapeCharacter()
306
307    /**
308     *  Returns {@code true} if the given character is an ASCII character.
309     *
310     *  @param  c   The character to check.
311     *  @return {@code true} if the given character is an ASCII character,
312     *      {@code false} otherwise.
313     */
314    public static final boolean isASCIICharacter( final char c )
315    {
316        return isASCIICharacter( (int) c );
317    }   //  isASCIICharacter()
318
319    /**
320     *  Returns {@code true} if the given code point represents an ASCII
321     *  character.
322     *
323     *  @param  codePoint   The code point to check.
324     *  @return {@code true} if the given code point represents an ASCII
325     *      character, {@code false} otherwise.
326     */
327    public static final boolean isASCIICharacter( final int codePoint )
328    {
329        final var retValue = (codePoint >= MIN_CODE_POINT) && (codePoint < 0x80);
330
331        //---* Done *----------------------------------------------------------
332        return retValue;
333    }   //  isASCIICharacter()
334
335    /**
336     *  Returns {@code true} if the given character is a printable ASCII
337     *  character. That means, it is an ASCII character, but not a control
338     *  character.
339     *
340     *  @param  c   The character to check.
341     *  @return {@code true} if the given character is a printable ASCII
342     *      character, {@code false} otherwise.
343     */
344    public static final boolean isPrintableASCIICharacter( final char c )
345    {
346        return isPrintableASCIICharacter( (int) c );
347    }   //  isPrintableASCIICharacter()
348
349    /**
350     *  Returns {@code true} if the given code point represents a printable
351     *  ASCII character. That means, it is an ASCII character, but not a
352     *  control character.
353     *
354     *  @param  codePoint   The code point to check.
355     *  @return {@code true} if the given code point represents a printable
356     *      ASCII character, {@code false} otherwise.
357     */
358    public static final boolean isPrintableASCIICharacter( final int codePoint )
359    {
360        final var retValue = !isISOControl(codePoint) && (codePoint >= MIN_CODE_POINT) && (codePoint < 0x80);
361
362        //---* Done *----------------------------------------------------------
363        return retValue;
364    }   //  isPrintableASCIICharacter()
365
366    /**
367     *  Extracts the escape sequence from the given chunk, write the result to
368     *  the buffer and returns the offset.
369     *
370     *  @param  buffer  The target buffer.
371     *  @param  pattern The regex pattern for the check.
372     *  @param chunk    The chunk to check.
373     *  @return The offset; one of 1, 6, or 12.
374     */
375    private static final int extractEscapeSequence( final StringBuilder buffer, final Pattern pattern, final CharSequence chunk )
376    {
377        var retValue = 1;
378        if( chunk.length() >= 6 )
379        {
380            final var c1 = chunk.subSequence( 0, 6 );
381            if( pattern.matcher( c1 ).matches() )
382            {
383                if( (chunk.length() == 12) && pattern.matcher( chunk.subSequence( 6, 12 ) ).matches() )
384                {
385                    try
386                    {
387                        buffer.append( unescapeUnicode( chunk ) );
388                        retValue = 12;
389                    }
390                    catch( final ValidationException ignored ) { /* Deliberately ignored */ }
391                }
392
393                if( retValue == 1 )
394                {
395                    try
396                    {
397                        buffer.append( unescapeUnicode( c1 ) );
398                        retValue = 6;
399                    }
400                    catch( final ValidationException ignored ) { /* Deliberately ignored */ }
401                }
402            }
403        }
404
405        if( retValue == 1 ) buffer.append( chunk.charAt( 0 ) );
406
407        //---* Done *----------------------------------------------------------
408        return retValue;
409    }   //  extractEscapeSequence()
410
411    /**
412     *  Parses Strings in the format &quot;<code>&#92;uXXXX</code>&quot;,
413     *  containing the textual representation of a single Unicode character, to
414     *  the respective Unicode character. Some Unicode characters will be
415     *  represented as <i>surrogate pairs</i> in Java, so the String that is
416     *  returned by this method may contain more than one {@code char}.<br>
417     *  <br>The input format for this method is used in Java source code
418     *  Strings, in Java {@code .properties} files, in C/C++ source code, in
419     *  JavaScript source, &hellip;
420     *
421     *  @param  input   The input String with the Unicode escape sequence.
422     *  @return The Unicode character.
423     *  @throws ValidationException The input is {@code null}, empty, or cannot
424     *      be parsed as a Unicode escape sequence.
425     *
426     *  @since 0.1.5
427     */
428    @API( status = STABLE, since = "0.1.5" )
429    public static final String unescapeUnicode( final CharSequence input )
430    {
431        final var len = requireNotEmptyArgument( input, "input" ).length();
432        //noinspection MagicNumber
433        if( (len != 6) && (len != 12) ) throw new ValidationException( "The length of a Unicode String must be 6 or 12 characters" );
434        if( !input.subSequence( 0, 2 ).equals( "\\u" ) ) throw new ValidationException( "Unicode String must start with '\\u'" );
435
436        final var msgCannotparse = "Cannot parse '%s' as a Unicode Escape String";
437        @SuppressWarnings( "NumericCastThatLosesPrecision" )
438        final var characters = breakString( input, 6 )
439            .mapToInt( chunk ->
440            {
441                try
442                {
443                    return Integer.parseInt( chunk.subSequence( 2, 6 ).toString(), 0x10 );
444                }
445                catch( final NumberFormatException e )
446                {
447                    throw new ValidationException( format( msgCannotparse, input ), e );
448                }
449            } )
450            .mapToObj( i -> Character.valueOf( (char) i ) )
451            .toArray( Character []::new );
452
453        final var codePoint = switch( characters.length )
454            {
455                case 1 -> characters [0];
456                case 2 ->
457                    {
458                        if( !isSurrogatePair( characters [0], characters [1] ) )
459                        {
460                            throw new ValidationException( format( msgCannotparse, input ) );
461                        }
462                        yield toCodePoint( characters [0], characters [1] );
463                    }
464                default -> throw new ValidationException( format( msgCannotparse, input ) );
465            };
466        if( !Character.isValidCodePoint( codePoint ) ) throw new ValidationException( format( msgCannotparse, input ) );
467        final var retValue = new String( toChars( codePoint ) );
468
469        //---* Done *----------------------------------------------------------
470        return retValue;
471    }   //  unescapeUnicode()
472}
473//  class CharSetUtils
474
475/*
476 *  End of File
477 */