/*
 * ============================================================================
 *  Copyright © 2002-2026 by Thomas Thrien.
 *  All Rights Reserved.
 * ============================================================================
 *  Licensed to the public under the agreements of the GNU Lesser General Public
 *  License, version 3.0 (the "License"). You may obtain a copy of the License at
 *
 *       http://www.gnu.org/licenses/lgpl.html
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 *  License for the specific language governing permissions and limitations
 *  under the License.
 */

package org.tquadrat.foundation.util;

import org.apiguardian.api.API;
import org.tquadrat.foundation.annotation.ClassVersion;
import org.tquadrat.foundation.annotation.UtilityClass;
import org.tquadrat.foundation.exception.PrivateConstructorForStaticClassCalledError;
import org.tquadrat.foundation.exception.ValidationException;

import java.text.Normalizer;
import java.util.regex.Pattern;

import static java.lang.Character.MIN_CODE_POINT;
import static java.lang.Character.isISOControl;
import static java.lang.Character.isSurrogatePair;
import static java.lang.Character.toChars;
import static java.lang.Character.toCodePoint;
import static java.lang.Integer.min;
import static java.lang.String.format;
import static java.text.Normalizer.isNormalized;
import static java.text.Normalizer.normalize;
import static java.util.regex.Pattern.compile;
import static java.util.stream.Collectors.joining;
import static org.apiguardian.api.API.Status.STABLE;
import static org.tquadrat.foundation.lang.CommonConstants.EMPTY_STRING;
import static org.tquadrat.foundation.lang.Objects.isNull;
import static org.tquadrat.foundation.lang.Objects.nonNull;
import static org.tquadrat.foundation.lang.Objects.requireNotEmptyArgument;
import static org.tquadrat.foundation.util.StringUtils.breakString;
import static org.tquadrat.foundation.util.StringUtils.isEmpty;

/**
 *  This class provides several utilities dealing with Strings in different
 *  character sets/encodings.
 *
 *  @extauthor Thomas Thrien - thomas.thrien@tquadrat.org
 *  @version CharSetUtils: HexUtils.java 747 2020-12-01 12:40:38Z tquadrat $
 *
 *  @UMLGraph.link
 *  @since 0.1.0
 */
@SuppressWarnings( "MagicNumber" )
@ClassVersion( sourceVersion = "$Id: CharSetUtils.java 1163 2026-03-20 15:28:33Z tquadrat $" )
@API( status = STABLE, since = "0.1.0" )
@UtilityClass
public final class CharSetUtils
{
        /*-----------*\
    ====** Constants **========================================================
        \*-----------*/
    /**
     *  The number of characters in a single Unicode escape sequence
     *  (backslash, 'u' and four hex digits).
     */
    private static final int ESCAPE_LENGTH = 6;

    /**
     *  The message for an input that cannot be parsed as a Unicode escape
     *  sequence.
     */
    private static final String MSG_CANNOT_PARSE = "Cannot parse '%s' as a Unicode Escape String";

    /**
     *  The pattern for a single Unicode escape sequence: a backslash, the
     *  letter 'u' and exactly four ASCII hex digits. It is compiled only
     *  once, as instances of
     *  {@link Pattern}
     *  are immutable and thread-safe.
     */
    private static final Pattern UNICODE_ESCAPE_PATTERN = compile( "\\\\u\\p{XDigit}{4}" );

        /*--------------*\
    ====** Constructors **=====================================================
        \*--------------*/
    /**
     *  No instance allowed for this class!
     */
    private CharSetUtils() { throw new PrivateConstructorForStaticClassCalledError( CharSetUtils.class ); }

        /*---------*\
    ====** Methods **==========================================================
        \*---------*/
    /**
     *  Converts the given byte array into a String that will only contain
     *  printable ASCII characters; all other characters will be 'escaped' to
     *  the format &quot;<code>&#92;uXXXX</code>&quot;. Each byte is treated
     *  as an unsigned value in the range 0 to 255. This can be useful to
     *  generate a String in another character set/encoding than ASCII or
     *  UTF-8/Unicode, given that the receiving part can interpret the
     *  format.<br>
     *  <br>But generally, a transfer encoding like BASE64 or quoted-printable
     *  should be preferred.
     *
     *  @param  bytes   The input; may be {@code null}.
     *  @return The output string; {@code null} if the input was already
     *      {@code null}.
     *
     *  @since 0.1.0
     */
    @API( status = STABLE, since = "0.1.0" )
    public static final String convertBytesToASCII( final byte [] bytes )
    {
        String retValue = null;
        if( nonNull( bytes ) )
        {
            if( bytes.length == 0 )
            {
                retValue = EMPTY_STRING;
            }
            else
            {
                final var buffer = new StringBuilder( bytes.length );
                for( final var b : bytes )
                {
                    /*
                     * A byte is signed in Java; a plain widening would turn
                     * all values above 0x7F into negative numbers, and these
                     * are not valid code points. So the sign extension is
                     * masked away.
                     */
                    final var codePoint = b & 0xFF;
                    if( isPrintableASCIICharacter( codePoint ) )
                    {
                        buffer.append( (char) codePoint );
                    }
                    else
                    {
                        buffer.append( escapeCharacter( codePoint ) );
                    }
                }
                retValue = buffer.toString();
            }
        }

        //---* Done *----------------------------------------------------------
        return retValue;
    }   //  convertBytesToASCII()

    /**
     *  Converts a String that contains only ASCII characters and Unicode
     *  escape sequences like &quot;<code>&#92;uXXXX</code>&quot; to the
     *  equivalent Unicode String. Two consecutive escape sequences that form
     *  a surrogate pair are translated together.<br>
     *  <br>This method will not touch other escape sequences, like
     *  <code>&quot;&#92;n&quot;</code> or <code>&quot;&#92;t&quot;</code>.
     *  Refer to
     *  {@link String#translateEscapes()}.
     *
     *  @param  input   The input String; may be {@code null}.
     *  @return The output string; {@code null} if the input string was
     *      already {@code null}.
     *  @throws IllegalArgumentException    Declared for compatibility only;
     *      this implementation does not check whether the input is pure
     *      ASCII, any character that is not part of a Unicode escape
     *      sequence is copied to the output unchanged.
     *
     *  @since 0.1.0
     */
    @API( status = STABLE, since = "0.1.0" )
    public static final String convertEscapedStringToUnicode( final CharSequence input ) throws IllegalArgumentException
    {
        String retValue = null;
        if( nonNull( input ) )
        {
            if( isEmpty( input ) )
            {
                retValue = EMPTY_STRING;
            }
            else
            {
                final var inputLength = input.length();
                final var buffer = new StringBuilder( inputLength );
                var inputPos = 0;
                while( inputPos < inputLength )
                {
                    final var currentChar = input.charAt( inputPos );
                    if( currentChar == '\\' )
                    {
                        /*
                         * This may be the begin of an escape sequence; the
                         * chunk is large enough to hold a surrogate pair.
                         */
                        final var chunk = input.subSequence( inputPos, min( inputLength, inputPos + 2 * ESCAPE_LENGTH ) );
                        inputPos += extractEscapeSequence( buffer, chunk );
                    }
                    else
                    {
                        buffer.append( currentChar );
                        ++inputPos;
                    }
                }
                retValue = buffer.toString();
            }
        }

        //---* Done *----------------------------------------------------------
        return retValue;
    }   //  convertEscapedStringToUnicode()

    /**
     *  Applies the given normalisation to the given Unicode String and
     *  translates it to a String that will only contain printable ASCII
     *  characters; all other characters will be 'escaped' to the format
     *  &quot;<code>&#92;uXXXX</code>&quot;.
     *
     *  @param  normalization   The normalisation form; in case it is
     *      {@code null}, no normalisation will be performed.
     *  @param  input   The input String; may be {@code null}.
     *  @return The output String; {@code null} if the input String was
     *      already {@code null}.
     *
     *  @since 0.1.0
     */
    @API( status = STABLE, since = "0.1.0" )
    public static final String convertUnicodeToASCII( final Normalizer.Form normalization, final CharSequence input )
    {
        String retValue = null;
        if( nonNull( input ) )
        {
            if( isEmpty( input ) )
            {
                retValue = EMPTY_STRING;
            }
            else
            {
                //---* Normalise the String, if required *---------------------
                final var sequence = isNull( normalization ) || isNormalized( input, normalization )
                    ? input
                    : normalize( input, normalization );

                retValue = sequence.codePoints()
                    .mapToObj( codePoint ->
                        isPrintableASCIICharacter( codePoint )
                        ? Character.toString( codePoint )
                        : escapeCharacter( codePoint ) )
                    .collect( joining() );
            }
        }

        //---* Done *----------------------------------------------------------
        return retValue;
    }   //  convertUnicodeToASCII()

    /**
     *  Translates the given Unicode String without any normalisation to a
     *  String that will only contain printable ASCII characters; all other
     *  characters will be 'escaped' to the format
     *  &quot;<code>&#92;uXXXX</code>&quot;. Calling this method is the same as
     *  calling
     *  {@link #convertUnicodeToASCII(Normalizer.Form, CharSequence)}
     *  with {@code null} as the first argument.
     *
     *  @param  input   The input String; may be {@code null}.
     *  @return The output String; {@code null} if the input String was
     *      already {@code null}.
     *
     *  @since 0.1.0
     */
    @API( status = STABLE, since = "0.1.0" )
    public static final String convertUnicodeToASCII( final CharSequence input )
    {
        final var retValue = convertUnicodeToASCII( null, input );

        //---* Done *----------------------------------------------------------
        return retValue;
    }   //  convertUnicodeToASCII()

    /**
     *  Returns the Unicode escape sequence for the given character. This will
     *  return &quot;<code>&#92;u0075</code>&quot; for the letter 'u', and
     *  &quot;<code>&#92;u003c</code>&quot; for the smaller-than sign
     *  '&lt;'.<br>
     *  <br>This method should be used only for characters that are not
     *  surrogates; for general use, the implementation that takes a code point
     *  is preferred.
     *
     *  @param  c   The character.
     *  @return The escape sequence.
     *
     *  @see #escapeCharacter(int)
     *
     *  @since 0.1.0
     */
    @API( status = STABLE, since = "0.1.0" )
    public static final String escapeCharacter( final char c )
    {
        final var retValue = format( "\\u%04x", Integer.valueOf( c ) );

        //---* Done *----------------------------------------------------------
        return retValue;
    }   //  escapeCharacter()

    /**
     *  <p>{@summary Returns the Unicode escape sequence for the given code
     *  point.} This will return &quot;<code>&#92;u0075</code>&quot; for the
     *  letter 'u', and &quot;<code>&#92;u003c</code>&quot; for the
     *  smaller-than sign '&lt;'. A supplementary code point is returned as
     *  the two escape sequences for its surrogate pair.</p>
     *  <p>This method takes only a single code point; to translate a whole
     *  String, this code sequence can be used:</p>
     *  <div class="source-container"><pre>…
     *  String result = input.codePoints()
     *      .mapToObj( codePoint -&gt; escapeCharacter( codePoint ) )
     *      .collect( Collectors.joining() );
     *  …</pre></div>
     *  <p>This will escape <i>all</i> characters in the String. If only a subset
     *  needs to be escaped, the mapping function in
     *  {@link java.util.stream.IntStream#mapToObj(java.util.function.IntFunction) mapToObj()}
     *  can be adjusted accordingly. Something like that is implemented with
     *  the method
     *  {@link #convertUnicodeToASCII(CharSequence)}.</p>
     *
     *  @param  codePoint   The character.
     *  @return The escape sequence.
     *  @throws IllegalArgumentException    The given code point is invalid.
     *
     *  @see String#codePoints()
     *  @see java.util.stream.IntStream#mapToObj(java.util.function.IntFunction)
     *  @see java.util.stream.Stream#collect(java.util.stream.Collector)
     *  @see java.util.stream.Collectors#joining()
     *
     *  @since 0.1.0
     */
    @API( status = STABLE, since = "0.1.0" )
    public static final String escapeCharacter( final int codePoint ) throws IllegalArgumentException
    {
        final var retValue = new StringBuilder();

        //---* toChars() rejects an invalid code point *-----------------------
        for( final var c : toChars( codePoint ) ) retValue.append( escapeCharacter( c ) );

        //---* Done *----------------------------------------------------------
        return retValue.toString();
    }   //  escapeCharacter()

    /**
     *  Returns {@code true} if the given character is an ASCII character.
     *
     *  @param  c   The character to check.
     *  @return {@code true} if the given character is an ASCII character,
     *      {@code false} otherwise.
     */
    public static final boolean isASCIICharacter( final char c )
    {
        return isASCIICharacter( (int) c );
    }   //  isASCIICharacter()

    /**
     *  Returns {@code true} if the given code point represents an ASCII
     *  character.
     *
     *  @param  codePoint   The code point to check.
     *  @return {@code true} if the given code point represents an ASCII
     *      character, {@code false} otherwise.
     */
    public static final boolean isASCIICharacter( final int codePoint )
    {
        final var retValue = (codePoint >= MIN_CODE_POINT) && (codePoint < 0x80);

        //---* Done *----------------------------------------------------------
        return retValue;
    }   //  isASCIICharacter()

    /**
     *  Returns {@code true} if the given character is a printable ASCII
     *  character. That means, it is an ASCII character, but not a control
     *  character.
     *
     *  @param  c   The character to check.
     *  @return {@code true} if the given character is a printable ASCII
     *      character, {@code false} otherwise.
     */
    public static final boolean isPrintableASCIICharacter( final char c )
    {
        return isPrintableASCIICharacter( (int) c );
    }   //  isPrintableASCIICharacter()

    /**
     *  Returns {@code true} if the given code point represents a printable
     *  ASCII character. That means, it is an ASCII character, but not a
     *  control character.
     *
     *  @param  codePoint   The code point to check.
     *  @return {@code true} if the given code point represents a printable
     *      ASCII character, {@code false} otherwise.
     */
    public static final boolean isPrintableASCIICharacter( final int codePoint )
    {
        final var retValue = isASCIICharacter( codePoint ) && !isISOControl( codePoint );

        //---* Done *----------------------------------------------------------
        return retValue;
    }   //  isPrintableASCIICharacter()

    /**
     *  Extracts the escape sequence at the begin of the given chunk, writes
     *  the result to the buffer and returns the number of characters that
     *  were consumed from the chunk.<br>
     *  <br>If the chunk does not start with a Unicode escape sequence, only
     *  the first character of the chunk is copied to the buffer. If it starts
     *  with two escape sequences that form a surrogate pair, both are
     *  translated; otherwise only the first one is translated.
     *
     *  @param  buffer  The target buffer.
     *  @param  chunk   The chunk to check; it is not empty and has a maximum
     *      length of 12 characters.
     *  @return The offset; one of 1, 6, or 12.
     */
    private static final int extractEscapeSequence( final StringBuilder buffer, final CharSequence chunk )
    {
        var retValue = 1;
        if( isEscapeSequenceAt( chunk, 0 ) )
        {
            final var first = parseEscapedChar( chunk, 0 );
            buffer.append( first );
            retValue = ESCAPE_LENGTH;

            if( isEscapeSequenceAt( chunk, ESCAPE_LENGTH ) )
            {
                /*
                 * The second escape sequence is consumed here only if it
                 * completes a surrogate pair; otherwise it will be handled
                 * by the next iteration of the caller's loop.
                 */
                final var second = parseEscapedChar( chunk, ESCAPE_LENGTH );
                if( isSurrogatePair( first, second ) )
                {
                    buffer.append( second );
                    retValue = 2 * ESCAPE_LENGTH;
                }
            }
        }
        else
        {
            //---* Not an escape sequence: copy the character as is *----------
            buffer.append( chunk.charAt( 0 ) );
        }

        //---* Done *----------------------------------------------------------
        return retValue;
    }   //  extractEscapeSequence()

    /**
     *  Checks whether the given sequence contains a complete Unicode escape
     *  sequence at the given offset.
     *
     *  @param  sequence    The sequence to check.
     *  @param  offset  The position where the escape sequence should start.
     *  @return {@code true} if there are at least six characters from the
     *      offset on, and if these match the escape pattern, {@code false}
     *      otherwise.
     */
    private static final boolean isEscapeSequenceAt( final CharSequence sequence, final int offset )
    {
        final var end = offset + ESCAPE_LENGTH;
        final var retValue = (sequence.length() >= end)
            && UNICODE_ESCAPE_PATTERN.matcher( sequence ).region( offset, end ).matches();

        //---* Done *----------------------------------------------------------
        return retValue;
    }   //  isEscapeSequenceAt()

    /**
     *  Parses the four hex digits of the escape sequence at the given offset
     *  to a {@code char}. The caller has to ensure that there is a valid
     *  escape sequence at that position, by calling
     *  {@link #isEscapeSequenceAt(CharSequence, int)}
     *  beforehand.
     *
     *  @param  sequence    The sequence with the escape sequence.
     *  @param  offset  The position where the escape sequence starts.
     *  @return The character.
     */
    @SuppressWarnings( "NumericCastThatLosesPrecision" )
    private static final char parseEscapedChar( final CharSequence sequence, final int offset )
    {
        //---* Four hex digits will always fit into a char *-------------------
        final var retValue = (char) Integer.parseInt( sequence, offset + 2, offset + ESCAPE_LENGTH, 0x10 );

        //---* Done *----------------------------------------------------------
        return retValue;
    }   //  parseEscapedChar()

    /**
     *  Parses Strings in the format &quot;<code>&#92;uXXXX</code>&quot;,
     *  containing the textual representation of a single Unicode character, to
     *  the respective Unicode character. Some Unicode characters will be
     *  represented as <i>surrogate pairs</i> in Java, so the String that is
     *  returned by this method may contain more than one {@code char}; for
     *  these, the input consists of two consecutive escape sequences.<br>
     *  <br>The input format for this method is used in Java source code
     *  Strings, in Java {@code .properties} files, in C/C++ source code, in
     *  JavaScript source, …
     *
     *  @param  input   The input String with the Unicode escape sequence.
     *  @return The Unicode character.
     *  @throws ValidationException The input is {@code null}, empty, or cannot
     *      be parsed as a Unicode escape sequence; the latter includes an
     *      input with two escape sequences that do not form a surrogate pair.
     *
     *  @since 0.1.5
     */
    @API( status = STABLE, since = "0.1.5" )
    public static final String unescapeUnicode( final CharSequence input )
    {
        final var len = requireNotEmptyArgument( input, "input" ).length();
        if( (len != ESCAPE_LENGTH) && (len != 2 * ESCAPE_LENGTH) ) throw new ValidationException( "The length of a Unicode String must be 6 or 12 characters" );

        /*
         * The prefix is compared character by character, because the
         * interface CharSequence does not define a contract for equals();
         * comparing a sub-sequence with a String would fail for any
         * implementation that does not return a String from subSequence().
         */
        if( (input.charAt( 0 ) != '\\') || (input.charAt( 1 ) != 'u') ) throw new ValidationException( "Unicode String must start with '\\u'" );

        //---* Each chunk must be a well-formed escape sequence *--------------
        final var isPair = len == 2 * ESCAPE_LENGTH;
        if( !isEscapeSequenceAt( input, 0 ) || (isPair && !isEscapeSequenceAt( input, ESCAPE_LENGTH )) )
        {
            throw new ValidationException( format( MSG_CANNOT_PARSE, input ) );
        }

        final String retValue;
        final var first = parseEscapedChar( input, 0 );
        if( isPair )
        {
            final var second = parseEscapedChar( input, ESCAPE_LENGTH );
            if( !isSurrogatePair( first, second ) )
            {
                throw new ValidationException( format( MSG_CANNOT_PARSE, input ) );
            }
            retValue = Character.toString( toCodePoint( first, second ) );
        }
        else
        {
            retValue = String.valueOf( first );
        }

        //---* Done *----------------------------------------------------------
        return retValue;
    }   //  unescapeUnicode()
}
//  class CharSetUtils

/*
 *  End of File
 */