/******************************************************************************/ /* STRINGS AND RELATED UTILITIES **********************************************/ /******************************************************************************/ #ifndef _H_EBCL_STRINGS #define _H_EBCL_STRINGS #include #include #include #include namespace ebcl { /*= UTF-8 UTILITY FUNCTIONS ==================================================*/ // Is the specified C string valid UTF-8 ? bool UTF8IsValid( char const* string ); // Get the length (in characters) of the specified UTF-8 0-terminated string uint32_t UTF8Length( char const* string ); // Get the size (in bytes) of the specified UTF-8 0-terminated string uint32_t UTF8Size( char const* string ); // Combined function that does all the above bool UTF8Info( char const* string , uint32_t& size , uint32_t& length ); // Check if the specified data is a valid UTF-8 string, and compute its length // (in characters). bool UTF8BufferInfo( char const* data , uint32_t size , uint32_t& length ); // Get the codepoint from a sequence of UTF-8 bytes. Sets "bytes" to the amount // of bytes read from the input. uint32_t UTF8GetCodepoint( char const* data , uint32_t& bytes ); // Similar to the above, without the amount of bytes output. uint32_t UTF8GetCodepoint( char const* data ); // Write an UTF-8 encoded codepoint to a string, returns the amount of bytes // that were written, or 0 if there wasn't enough space. uint32_t UTF8PutCodepoint( char* output , uint32_t available , uint32_t codepoint ); // Get the memory offset of a codepoint in an UTF-8 sequence based on its index. uint32_t UTF8GetMemoryOffset( char const* input , uint32_t index ); // Convert an UTF-8 sequence into an unsigned integer. uint64_t UTF8ToUnsignedInteger( char const* input , uint32_t size , bool * ok = nullptr , int base = 10 , bool useSep = false , uint32_t separator = ' ' ); // Convert an UTF-8 sequence into a signed integer. int64_t UTF8ToInteger( char const* input , uint32_t size , bool * ok = nullptr , int base = 10 , bool useSep = false , uint32_t separator = ' ' ); // Convert an UTF-8 sequence into a double precision floating point number. The // sequence will be checked, converted into a C string and passed to strtod() // for actual conversion. double UTF8ToDouble( char const* input , uint32_t size , bool * ok = nullptr , uint32_t decimalPoint = '.' , bool useSep = false , uint32_t separator = ' ' ); /*= UNICODE CHARACTERS =======================================================*/ struct T_Character { using F_Map = std::function< T_Character( T_Character ) >; const uint32_t codepoint; constexpr T_Character( ) noexcept; constexpr T_Character( T_Character const& other ) noexcept; M_WITH_INT( T ) constexpr T_Character( T codepoint ) noexcept; // --------------------------------------------------------------------- constexpr bool isValid( ) const; constexpr bool isAscii( ) const; constexpr bool isControl( ) const; constexpr bool isUppercase( ) const; constexpr bool isLowercase( ) const; constexpr bool isAlpha( ) const; constexpr bool isNumeric( ) const; constexpr bool isAlphanumeric( ) const; constexpr bool isWhitespace( ) const; // --------------------------------------------------------------------- constexpr bool operator== ( T_Character const& other ) const; constexpr bool operator!= ( T_Character const& other ) const; constexpr bool operator< ( T_Character const& other ) const; constexpr bool operator> ( T_Character const& other ) const; constexpr bool operator<= ( T_Character const& other ) const; constexpr bool operator>= ( T_Character const& other ) const; M_WITH_INT( T ) constexpr bool operator== ( T other ) const; M_WITH_INT( T ) constexpr bool operator!= ( T other ) const; M_WITH_INT( T ) constexpr bool operator< ( T other ) const; M_WITH_INT( T ) constexpr bool operator<= ( T other ) const; M_WITH_INT( T ) constexpr bool operator> ( T other ) const; M_WITH_INT( T ) constexpr bool operator>= ( T other ) const; constexpr operator uint32_t ( ) const; // --------------------------------------------------------------------- uint32_t writeTo( char* output , uint32_t avail ) const; // --------------------------------------------------------------------- constexpr T_Character toUpper( ) const; constexpr T_Character toLower( ) const; }; M_CLASS_POINTERS( Character ); /*= IMMUTABLE UTF8 STRINGS ===================================================*/ /* * NOTE: the *objects* are NOT immutable. The strings they contain, however, * are. */ class T_StringBuilder; class T_StringIterator; // T_StringData - Abstract base for the various types of string storage. class A_StringData { protected: char* data_; uint32_t size_; bool valid_; uint32_t length_; A_StringData( ) = default; A_StringData( A_StringData const& ) = delete; A_StringData( A_StringData&& ) = delete; public: virtual ~A_StringData( ) = 0; // Is valid UTF-8? bool valid( ) const; // Get a pointer to the data char const * data( ) const; // Length in characters uint32_t length( ) const; // Size in bytes uint32_t size( ) const; }; M_ABSTRACT_POINTERS( StringData ); /*----------------------------------------------------------------------------*/ // T_String - Main UTF-8 string class class T_String { private: RP_StringData data_; public: // Construct an empty string. No overhead whatsoever, just an assignment T_String( ) noexcept; // This constructor will try to: // - use the empty string if initial is null or "" // - use a pooled string if one matches initial // - create a dynamic string otherwise. // Because of this, it is relatively slow and should be avoided in // general. T_String( char const* initial ); T_String( T_StringBuilder&& sb ); T_String( T_StringBuilder const& sb ); // Construct a dynamic string, either using the provided memory // or duplicating it. Doesn't use the pools at all. T_String( char const* data , uint32_t size , bool nodup = false ); T_String( T_String const& source ); T_String( T_String&& source ) noexcept; ~T_String( ); // --------------------------------------------------------------------- // Get a pooled string. Faster than constructing then calling // addToPool( ) if the string is already pooled. If it isn't, it will // be added as a dynamic string. static T_String Pooled( char const* string ); static T_String Pooled( char const* data , uint32_t size ); // --------------------------------------------------------------------- T_String& operator= ( T_String&& string ) noexcept; T_String& operator= ( T_String const& string ); T_String& operator= ( T_StringBuilder&& sb ); T_String& operator= ( T_StringBuilder const& sb ); friend void swap( T_String& lhs , T_String& rhs ) noexcept; // --------------------------------------------------------------------- // Adds the string to the pool if it isn't pooled already. If the pool // already contains a pooled version of this string, use it instead. T_String& addToPool( ); // Attempts to use the pooled version of a string if it exists. If it // doesn't, keep using the current version. T_String& usePool( ); // --------------------------------------------------------------------- bool valid( ) const; uint32_t size( ) const; uint32_t length( ) const; char const * data( ) const; operator bool ( ) const; bool operator! ( ) const; // --------------------------------------------------------------------- // Return the character at the specified index T_Character operator[] ( uint32_t index ) const; // Return a substring from the left side of the string T_String left( uint32_t count ) const; // Return a substring from the right side of the string T_String right( uint32_t count ) const; // Return a substring from the specified offset T_String substr( uint32_t offset , uint32_t count = UINT32_MAX ) const; // Return the substring between the two specified offsets T_String range( uint32_t start , uint32_t end ) const; // Remove whitespace from the start and end of the string T_String trim( ) const noexcept; // --------------------------------------------------------------------- // Generate a string using a function that transforms characters T_String mapped( T_Character::F_Map f ) const noexcept; // Convert the string to uppercase T_String toUpper( ) const noexcept; // Convert the string to lowercase T_String toLower( ) const noexcept; // --------------------------------------------------------------------- bool equals( T_String const& other ) const; bool equals( char const* string ) const; int32_t compare( T_String const& other ) const; int32_t compareIgnoreCase( T_String const& other ) const; bool startsWith( T_String const& other ) const; bool endsWith( T_String const& other ) const; // Finds a sub-string. Returns -1 if it isn't found. int32_t find( T_String const& other , uint32_t from = 0 ) const; // Finds a character. Returns -1 if it isn't found. int32_t find( T_Character character , uint32_t from = 0 ) const; bool operator== ( T_String const& other ) const; bool operator!= ( T_String const& other ) const; bool operator< ( T_String const& other ) const; bool operator> ( T_String const& other ) const; bool operator>= ( T_String const& other ) const; bool operator<= ( T_String const& other ) const; bool operator== ( char const* string ) const; bool operator!= ( char const* string ) const; // --------------------------------------------------------------------- T_String replace( T_Character initial , T_Character replacement ) const; T_String replace( T_String const& initial , T_String const& replacement ) const; // --------------------------------------------------------------------- uint64_t toUnsignedInteger( bool * ok = nullptr , int base = 10 , bool useSep = false , T_Character separator = ' ' ) const; int64_t toInteger( bool * ok = nullptr , int base = 10 , bool useSep = false , T_Character separator = ' ' ) const; double toDouble( bool * ok = nullptr , T_Character decimalPoint = '.' , bool useSep = false , T_Character separator = ' ' ) const; // --------------------------------------------------------------------- T_StringIterator getIterator( uint32_t offset ) const; operator T_StringIterator( ) const; // Converts the string to an array of bytes suitable for use with the // operating system's functions (e.g. UTF-8 C string on Linux, or // UTF-16 strings on Windows) T_Buffer< char > toOSString( ) const; }; M_CLASS_POINTERS( String ); M_DECLARE_HASH( T_String ); M_DECLARE_COMPARATOR( T_String ); void swap( T_String& lhs , T_String& rhs ) noexcept; extern template class T_Array< T_String >; /*= STRING ITERATORS =========================================================*/ class T_StringIterator final { friend class T_String; private: RP_StringData data_; uint32_t pos_; uint32_t index_; uint32_t codepoint_; uint32_t bytes_; T_StringIterator( RP_StringData data , uint32_t index ); public: T_StringIterator( ) = delete; T_StringIterator( T_StringIterator const& other ); T_StringIterator( T_StringIterator&& other ) noexcept; ~T_StringIterator( ); T_StringIterator& operator= ( T_StringIterator const& other ); T_StringIterator& operator= ( T_StringIterator&& other ) noexcept; friend void swap( T_StringIterator& lhs , T_StringIterator& rhs ) noexcept; // --------------------------------------------------------------------- bool next( ); uint32_t index( ) const; bool atEnd( ) const; T_Character character( ) const; operator T_Character ( ) const; }; M_CLASS_POINTERS( StringIterator ); void swap( T_StringIterator& lhs , T_StringIterator& rhs ) noexcept; /*= STRING BUILDERS ==========================================================*/ class T_StringBuilder { public: enum : uint32_t { C_GROWTH = 32 }; private: char* data_; uint32_t capacity_; uint32_t size_; uint32_t length_; friend class T_String; public: T_StringBuilder( ) noexcept; T_StringBuilder( T_StringBuilder const& other ); T_StringBuilder( T_StringBuilder&& other ) noexcept; T_StringBuilder( char const* data , uint32_t size ); explicit T_StringBuilder( T_String const& string ); explicit T_StringBuilder( char const* string ); ~T_StringBuilder( ); T_StringBuilder& operator =( T_StringBuilder const& other ); T_StringBuilder& operator =( T_StringBuilder&& other ) noexcept; friend void swap( T_StringBuilder& lhs , T_StringBuilder& rhs ); // --------------------------------------------------------------------- char const * data( ) const; uint32_t capacity( ) const; uint32_t size( ) const; uint32_t length( ) const; operator bool ( ) const; bool operator! ( ) const; // --------------------------------------------------------------------- T_StringBuilder& ensureCapacity( uint32_t minCap ); T_StringBuilder& clear( ); T_StringBuilder& free( ); T_StringBuilder& truncate( uint32_t maxLength ) noexcept; // --------------------------------------------------------------------- T_StringBuilder& append( T_StringBuilder const& other ); T_StringBuilder& append( T_StringBuilder&& other ); T_StringBuilder& append( T_String const& string ); T_StringBuilder& append( char const* string , uint32_t size ); T_StringBuilder& append( char character ); T_StringBuilder& append( T_Character character ); T_StringBuilder& appendNumeric( int64_t value , int base = 10 , bool useSep = false , T_Character sep = ' ' , int sepEvery = 3 ); T_StringBuilder& appendNumeric( uint64_t value , int base = 10 , bool useSep = false , T_Character sep = ' ' , int sepEvery = 3 ); T_StringBuilder& appendDouble( double value , uint32_t precision = 6 , bool trailingZeros = false ); // --------------------------------------------------------------------- bool operator== ( T_StringBuilder const& other ) const; bool operator!= ( T_StringBuilder const& other ) const; bool operator== ( T_String const& string ) const; bool operator!= ( T_String const& string ) const; bool operator== ( char const* string ) const; bool operator!= ( char const* string ) const; // --------------------------------------------------------------------- uint64_t toUnsignedInteger( bool * ok = nullptr , int base = 10 , bool useSep = false , T_Character separator = ' ' ) const; int64_t toInteger( bool * ok = nullptr , int base = 10 , bool useSep = false , T_Character separator = ' ' ) const; double toDouble( bool * ok = nullptr , T_Character decimalPoint = '.' , bool useSep = false , T_Character separator = ' ' ) const; }; M_CLASS_POINTERS( StringBuilder ); void swap( T_StringBuilder& lhs , T_StringBuilder& rhs ); // Operator << M_LSHIFT_OP( T_StringBuilder , T_StringBuilder const& ); M_LSHIFT_OP( T_StringBuilder , T_StringBuilder && ); M_LSHIFT_OP( T_StringBuilder , T_String const& ); M_LSHIFT_OP( T_StringBuilder , char const* ); M_LSHIFT_OP( T_StringBuilder , char ); M_LSHIFT_OP( T_StringBuilder , T_Character ); M_LSHIFT_OP( T_StringBuilder , int16_t ); M_LSHIFT_OP( T_StringBuilder , int32_t ); M_LSHIFT_OP( T_StringBuilder , int64_t ); M_LSHIFT_OP( T_StringBuilder , uint16_t ); M_LSHIFT_OP( T_StringBuilder , uint32_t ); M_LSHIFT_OP( T_StringBuilder , uint64_t ); M_LSHIFT_OP( T_StringBuilder , float ); M_LSHIFT_OP( T_StringBuilder , double ); } // namespace #endif // _H_EBCL_STRINGS #include