001/* 002 * ============================================================================ 003 * Copyright © 2002-2023 by Thomas Thrien. 004 * All Rights Reserved. 005 * ============================================================================ 006 * Licensed to the public under the agreements of the GNU Lesser General Public 007 * License, version 3.0 (the "License"). You may obtain a copy of the License at 008 * 009 * http://www.gnu.org/licenses/lgpl.html 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 013 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 014 * License for the specific language governing permissions and limitations 015 * under the License. 016 */ 017 018package org.tquadrat.foundation.util; 019 020import static java.lang.Character.MIN_CODE_POINT; 021import static java.lang.Character.isISOControl; 022import static java.lang.Character.isSurrogatePair; 023import static java.lang.Character.toChars; 024import static java.lang.Character.toCodePoint; 025import static java.lang.Integer.min; 026import static java.lang.String.format; 027import static java.text.Normalizer.isNormalized; 028import static java.text.Normalizer.normalize; 029import static java.util.regex.Pattern.compile; 030import static java.util.stream.Collectors.joining; 031import static org.apiguardian.api.API.Status.STABLE; 032import static org.tquadrat.foundation.lang.CommonConstants.EMPTY_STRING; 033import static org.tquadrat.foundation.lang.Objects.isNull; 034import static org.tquadrat.foundation.lang.Objects.nonNull; 035import static org.tquadrat.foundation.lang.Objects.requireNotEmptyArgument; 036import static org.tquadrat.foundation.util.StringUtils.breakString; 037import static org.tquadrat.foundation.util.StringUtils.isEmpty; 038 039import java.text.Normalizer; 040import java.util.regex.Pattern; 041 042import org.apiguardian.api.API; 043import org.tquadrat.foundation.annotation.ClassVersion; 044import org.tquadrat.foundation.annotation.UtilityClass; 045import org.tquadrat.foundation.exception.PrivateConstructorForStaticClassCalledError; 046import org.tquadrat.foundation.exception.ValidationException; 047 048/** 049 * This class provides several utilities dealing with Strings in different 050 * character sets/encodings. 051 * 052 * @extauthor Thomas Thrien - thomas.thrien@tquadrat.org 053 * @version CharSetUtils: HexUtils.java 747 2020-12-01 12:40:38Z tquadrat $ 054 * 055 * @UMLGraph.link 056 * @since 0.1.0 057 */ 058@SuppressWarnings( "MagicNumber" ) 059@ClassVersion( sourceVersion = "$Id: CharSetUtils.java 1060 2023-09-24 19:21:40Z tquadrat $" ) 060@API( status = STABLE, since = "0.1.0" ) 061@UtilityClass 062public final class CharSetUtils 063{ 064 /*--------------*\ 065 ====** Constructors **===================================================== 066 \*--------------*/ 067 /** 068 * No instance allowed for this class! 069 */ 070 private CharSetUtils() { throw new PrivateConstructorForStaticClassCalledError( CharSetUtils.class ); } 071 072 /*---------*\ 073 ====** Methods **========================================================== 074 \*---------*/ 075 /** 076 * Converts the given byte array into to a String that will only contain 077 * printable ASCII characters; all other characters will be 'escaped' to 078 * the format "<code>\uXXXX</code>". This can be useful to 079 * generate a String in another character set/encoding than ASCII or 080 * UTF-8/Unicode, given that the receiving part can interpret the 081 * format.<br> 082 * <br>But generally, a transfer encoding like BASE64 or quoted-printable 083 * should be preferred. 084 * 085 * @param bytes The input; may be {@code null}. 086 * @return The output string; {@code null} if the input was already 087 * {@code null}. 088 * 089 * @since 0.1.0 090 */ 091 @API( status = STABLE, since = "0.1.0" ) 092 public static final String convertBytesToASCII( final byte [] bytes ) 093 { 094 String retValue = null; 095 if( nonNull( bytes ) ) 096 { 097 if( bytes.length == 0 ) 098 { 099 retValue = EMPTY_STRING; 100 } 101 else 102 { 103 final var buffer = new StringBuilder(); 104 for( final var b : bytes ) 105 { 106 final var codePoint = (int) b; 107 //noinspection NonStrictComparisonCanBeEquality 108 buffer.append( (codePoint < ' ') || (codePoint >= 0x007F) 109 ? escapeCharacter( codePoint ) 110 : Character.toString( codePoint ) ); 111 } 112 retValue = buffer.toString(); 113 } 114 } 115 116 //---* Done *---------------------------------------------------------- 117 return retValue; 118 } // convertBytesToASCII() 119 120 /** 121 * Converts a String that contains only ASCII characters and Unicode 122 * escape sequences like "<code>\uXXXX</code>" to the 123 * equivalent Unicode String.<br> 124 * <br>This method will not touch other escape sequences, like 125 * <code>"\n"</code> or <code>"\t"</code>. 126 * Refer to 127 * {@link String#translateEscapes()}. 128 * 129 * @param input The input String; may be {@code null}. 130 * @return The output string; {@code null} if the input string was 131 * already {@code null}. 132 * @throws IllegalArgumentException The given input String contained at 133 * least one non-ASCII character. 134 * 135 * @since 0.1.0 136 */ 137 @API( status = STABLE, since = "0.1.0" ) 138 public static final String convertEscapedStringToUnicode( final CharSequence input ) throws IllegalArgumentException 139 { 140 String retValue = null; 141 if( nonNull( input ) ) 142 { 143 if( isEmpty( input ) ) 144 { 145 retValue = EMPTY_STRING; 146 } 147 else 148 { 149 final var pattern = compile( "\\\\u\\p{XDigit}{4}" ); 150 var inputPos = 0; 151 final var inputLength = input.length(); 152 final var buffer = new StringBuilder( inputLength ); 153 ScanLoop: while( inputPos < inputLength ) 154 { 155 final var currentChar = input.charAt( inputPos ); 156 if( currentChar == '\\' ) 157 { 158 //---* Is this an escape sequence? *------------------- 159 inputPos += extractEscapeSequence( buffer, pattern, input.subSequence( inputPos, min( inputLength, inputPos + 12 ) ) ); 160 continue ScanLoop; 161 } 162 163 buffer.append( currentChar ); 164 ++inputPos; 165 } // ScanLoop: 166 retValue = buffer.toString(); 167 } 168 } 169 170 //---* Done *---------------------------------------------------------- 171 return retValue; 172 } // convertEscapedStringToUnicode() 173 174 /** 175 * Applies the given normalisation to the given Unicode String and 176 * translates it to a String that will only contain printable ASCII 177 * characters; all other characters will be 'escaped' to the format 178 * "<code>\uXXXX</code>". 179 * 180 * @param normalization The normalisation form; in case it is 181 * {@code null}, no normalisation will be performed. 182 * @param input The input String; may be {@code null}. 183 * @return The output String; {@code null} if the input String was 184 * already {@code null}. 185 * 186 * @since 0.1.0 187 */ 188 @API( status = STABLE, since = "0.1.0" ) 189 public static final String convertUnicodeToASCII( final Normalizer.Form normalization, final CharSequence input ) 190 { 191 String retValue = null; 192 if( nonNull( input ) ) 193 { 194 if( isEmpty( input ) ) 195 { 196 retValue = EMPTY_STRING; 197 } 198 else 199 { 200 //---* Normalise the String *---------------------------------- 201 final var sequence = isNull( normalization ) 202 ? input 203 : isNormalized( input, normalization ) 204 ? input 205 : normalize( input, normalization ); 206 207 retValue = sequence.codePoints() 208 .mapToObj( codePoint -> 209 isPrintableASCIICharacter( codePoint ) 210 ? Character.toString( codePoint ) 211 : escapeCharacter( codePoint ) ) 212 .collect( joining() ); 213 } 214 } 215 216 //---* Done *---------------------------------------------------------- 217 return retValue; 218 } // convertUnicodeToASCII() 219 220 /** 221 * Translates the given Unicode String without any normalisation to a 222 * String that will only contain printable ASCII characters; all other 223 * characters will be 'escaped' to the format 224 * "<code>\uXXXX</code>". Calling this method is the same as 225 * calling 226 * {@link #convertUnicodeToASCII(Normalizer.Form, CharSequence)} 227 * with {@code null} as the first argument. 228 * 229 * @param input The input String; may be {@code null}. 230 * @return The output String; {@code null} if the input String was 231 * already {@code null}. 232 * 233 * @since 0.1.0 234 */ 235 @API( status = STABLE, since = "0.1.0" ) 236 public static final String convertUnicodeToASCII( final CharSequence input ) 237 { 238 final var retValue = convertUnicodeToASCII( null, input ); 239 240 //---* Done *---------------------------------------------------------- 241 return retValue; 242 } // convertUnicodeToASCII() 243 244 /** 245 * Returns the Unicode escape sequence for the given character. This will 246 * return "{@code \u0075}" for the letter 'u', and 247 * "{@code \u003c}" for the smaller-than sign '<'.<br> 248 * <br>This method should be used only for characters that are not 249 * surrogates; for general use, the implementation that takes a code point 250 * is preferred. 251 * 252 * @param c The character. 253 * @return The escape sequence. 254 * 255 * @see #escapeCharacter(int) 256 * 257 * @since 0.1.0 258 */ 259 @API( status = STABLE, since = "0.1.0" ) 260 public static final String escapeCharacter( final char c ) 261 { 262 final var retValue = format( "\\u%04x", Integer.valueOf( c ) ); 263 264 //---* Done *---------------------------------------------------------- 265 return retValue; 266 } // escapeCharacter() 267 268 /** 269 * Returns the Unicode escape sequence for the given code point. This will 270 * return "{@code \u0075}" for the letter 'u', and 271 * "{@code \u003c}" for the smaller-than sign '<'.<br> 272 * <br>This method takes only a single code point; to translate a whole 273 * String, this code sequence can be used:<pre><code> … 274 * String result = input.codePoints() 275 * .mapToObj( codePoint -> escapeUnicode( codePoint ) ) 276 * .collect( Collectors.joining() ); 277 * …</code></pre> 278 * This will escape <i>all</i> characters in the String. If only a subset 279 * needs to be escaped, the mapping function in 280 * {@link java.util.stream.IntStream#mapToObj(java.util.function.IntFunction) mapToObj()} 281 * can be adjusted accordingly. Something like that is implemented with 282 * the method 283 * {@link #convertUnicodeToASCII(CharSequence)}. 284 * 285 * @param codePoint The character. 286 * @return The escape sequence. 287 * @throws IllegalArgumentException The given code point is invalid. 288 * 289 * @see String#codePoints() 290 * @see java.util.stream.IntStream#mapToObj(java.util.function.IntFunction) 291 * @see java.util.stream.Stream#collect(java.util.stream.Collector) 292 * @see java.util.stream.Collectors#joining() 293 * 294 * @since 0.1.0 295 */ 296 @API( status = STABLE, since = "0.1.0" ) 297 public static final String escapeCharacter( final int codePoint ) throws IllegalArgumentException 298 { 299 final var retValue = new StringBuilder(); 300 for( final var c : toChars( codePoint ) ) retValue.append( format( "\\u%04x", Integer.valueOf( c ) ) ); 301 302 //---* Done *---------------------------------------------------------- 303 return retValue.toString(); 304 } // escapeCharacter() 305 306 /** 307 * Returns {@code true} if the given character is an ASCII character. 308 * 309 * @param c The character to check. 310 * @return {@code true} if the given character is an ASCII character, 311 * {@code false} otherwise. 312 */ 313 public static final boolean isASCIICharacter( final char c ) 314 { 315 return isASCIICharacter( (int) c ); 316 } // isASCIICharacter() 317 318 /** 319 * Returns {@code true} if the given code point represents an ASCII 320 * character. 321 * 322 * @param codePoint The code point to check. 323 * @return {@code true} if the given code point represents an ASCII 324 * character, {@code false} otherwise. 325 */ 326 public static final boolean isASCIICharacter( final int codePoint ) 327 { 328 final var retValue = (codePoint >= MIN_CODE_POINT) && (codePoint < 0x80); 329 330 //---* Done *---------------------------------------------------------- 331 return retValue; 332 } // isASCIICharacter() 333 334 /** 335 * Returns {@code true} if the given character is a printable ASCII 336 * character. That means, it is an ASCII character, but not a control 337 * character. 338 * 339 * @param c The character to check. 340 * @return {@code true} if the given character is a printable ASCII 341 * character, {@code false} otherwise. 342 */ 343 public static final boolean isPrintableASCIICharacter( final char c ) 344 { 345 return isPrintableASCIICharacter( (int) c ); 346 } // isPrintableASCIICharacter() 347 348 /** 349 * Returns {@code true} if the given code point represents a printable 350 * ASCII character. That means, it is an ASCII character, but not a 351 * control character. 352 * 353 * @param codePoint The code point to check. 354 * @return {@code true} if the given code point represents a printable 355 * ASCII character, {@code false} otherwise. 356 */ 357 public static final boolean isPrintableASCIICharacter( final int codePoint ) 358 { 359 final var retValue = !isISOControl(codePoint) && (codePoint >= MIN_CODE_POINT) && (codePoint < 0x80); 360 361 //---* Done *---------------------------------------------------------- 362 return retValue; 363 } // isPrintableASCIICharacter() 364 365 /** 366 * Extracts the escape sequence from the given chunk, write the result to 367 * the buffer and returns the offset. 368 * 369 * @param buffer The target buffer. 370 * @param pattern The regex pattern for the check. 371 * @param chunk The chunk to check. 372 * @return The offset; one of 1, 6, or 12. 373 */ 374 private static final int extractEscapeSequence( final StringBuilder buffer, final Pattern pattern, final CharSequence chunk ) 375 { 376 var retValue = 1; 377 if( chunk.length() >= 6 ) 378 { 379 final var c1 = chunk.subSequence( 0, 6 ); 380 if( pattern.matcher( c1 ).matches() ) 381 { 382 if( (chunk.length() == 12) && pattern.matcher( chunk.subSequence( 6, 12 ) ).matches() ) 383 { 384 try 385 { 386 buffer.append( unescapeUnicode( chunk ) ); 387 retValue = 12; 388 } 389 catch( final ValidationException ignored ) { /* Deliberately ignored */ } 390 } 391 392 if( retValue == 1 ) 393 { 394 try 395 { 396 buffer.append( unescapeUnicode( c1 ) ); 397 retValue = 6; 398 } 399 catch( final ValidationException ignored ) { /* Deliberately ignored */ } 400 } 401 } 402 } 403 404 if( retValue == 1 ) buffer.append( chunk.charAt( 0 ) ); 405 406 //---* Done *---------------------------------------------------------- 407 return retValue; 408 } // extractEscapeSequence() 409 410 /** 411 * Parses Strings in the format "<code>\uXXXX</code>", 412 * containing the textual representation of a single Unicode character, to 413 * the respective Unicode character. Some Unicode characters will be 414 * represented as <i>surrogate pairs</i> in Java, so the String that is 415 * returned by this method may contain more than one {@code char}.<br> 416 * <br>The input format for this method is used in Java source code 417 * Strings, in Java {@code .properties} files, in C/C++ source code, in 418 * JavaScript source, … 419 * 420 * @param input The input String with the Unicode escape sequence. 421 * @return The Unicode character. 422 * @throws ValidationException The input is {@code null}, empty, or cannot 423 * be parsed as a unicode escape sequence. 424 * 425 * @since 0.1.5 426 */ 427 @API( status = STABLE, since = "0.1.5" ) 428 public static final String unescapeUnicode( final CharSequence input ) 429 { 430 final var len = requireNotEmptyArgument( input, "input" ).length(); 431 //noinspection MagicNumber 432 if( (len != 6) && (len != 12) ) throw new ValidationException( "The length of a Unicode String must be 6 or 12 characters" ); 433 if( !input.subSequence( 0, 2 ).equals( "\\u" ) ) throw new ValidationException( "Unicode String must start with '\\u'" ); 434 435 final var msgCannotparse = "Cannot parse '%s' as a Unicode Escape String"; 436 @SuppressWarnings( "NumericCastThatLosesPrecision" ) 437 final var characters = breakString( input, 6 ) 438 .mapToInt( chunk -> 439 { 440 try 441 { 442 return Integer.parseInt( chunk.subSequence( 2, 6 ).toString(), 0x10 ); 443 } 444 catch( final NumberFormatException e ) 445 { 446 throw new ValidationException( format( msgCannotparse, input ), e ); 447 } 448 } ) 449 .mapToObj( i -> Character.valueOf( (char) i ) ) 450 .toArray( Character []::new ); 451 452 final var codePoint = switch( characters.length ) 453 { 454 case 1 -> characters [0]; 455 case 2 -> 456 { 457 if( !isSurrogatePair( characters [0], characters [1] ) ) 458 { 459 throw new ValidationException( format( msgCannotparse, input ) ); 460 } 461 yield toCodePoint( characters [0], characters [1] ); 462 } 463 default -> throw new ValidationException( format( msgCannotparse, input ) ); 464 }; 465 if( !Character.isValidCodePoint( codePoint ) ) throw new ValidationException( format( msgCannotparse, input ) ); 466 final var retValue = new String( toChars( codePoint ) ); 467 468 //---* Done *---------------------------------------------------------- 469 return retValue; 470 } // unescapeUnicode() 471} 472// class CharSetUtils 473 474/* 475 * End of File 476 */