2017-11-01 20:14:23 +01:00
|
|
|
/******************************************************************************/
/* STRINGS AND RELATED UTILITIES **********************************************/
/******************************************************************************/
|
|
|
|
|
2017-11-01 21:44:54 +01:00
|
|
|
#ifndef _H_EBCL_STRINGS
|
|
|
|
#define _H_EBCL_STRINGS
|
|
|
|
#include <ebcl/Externals.hh>
|
|
|
|
#include <ebcl/Pointers.hh>
|
|
|
|
#include <ebcl/Arrays.hh>
|
2017-12-27 19:00:56 +01:00
|
|
|
#include <ebcl/Buffers.hh>
|
2017-11-01 21:44:54 +01:00
|
|
|
namespace ebcl {
|
2017-11-01 20:14:23 +01:00
|
|
|
|
|
|
|
|
|
|
|
/*= UTF-8 UTILITY FUNCTIONS ==================================================*/
|
|
|
|
|
|
|
|
// Is the specified C string valid UTF-8 ?
//	string	the C string to check
bool UTF8IsValid( char const* string );
|
|
|
|
|
|
|
|
// Get the length (in characters) of the specified UTF-8 0-terminated string
//	string	the 0-terminated UTF-8 string to measure
uint32_t UTF8Length( char const* string );
|
|
|
|
|
|
|
|
// Get the size (in bytes) of the specified UTF-8 0-terminated string
//	string	the 0-terminated UTF-8 string to measure
// NOTE(review): whether the terminating 0 is counted is not visible from this
// header - confirm against the implementation.
uint32_t UTF8Size( char const* string );
|
|
|
|
|
|
|
|
// Combined function that does all the above: validity check, size in bytes
// and length in characters for a C string.
//	string	the C string to examine
//	size	output: size in bytes
//	length	output: length in characters
// The return value is presumably the validity flag - confirm against the
// implementation.
bool UTF8Info( char const* string , uint32_t& size , uint32_t& length );
|
|
|
|
|
|
|
|
// Check if the specified data is a valid UTF-8 string, and compute its length
// (in characters).
//	data	the bytes to examine
//	size	amount of bytes in the buffer
//	length	output: length in characters
bool UTF8BufferInfo( char const* data , uint32_t size , uint32_t& length );
|
|
|
|
|
|
|
|
// Get the codepoint from a sequence of UTF-8 bytes. Sets "bytes" to the amount
// of bytes read from the input.
//	data	start of the UTF-8 sequence
//	bytes	output: amount of bytes consumed
uint32_t UTF8GetCodepoint( char const* data , uint32_t& bytes );
|
|
|
|
|
|
|
|
// Similar to the two-argument UTF8GetCodepoint, without the amount of bytes
// output.
//	data	start of the UTF-8 sequence
uint32_t UTF8GetCodepoint( char const* data );
|
|
|
|
|
|
|
|
// Write an UTF-8 encoded codepoint to a string, returns the amount of bytes
// that were written, or 0 if there wasn't enough space.
//	output		where the encoded bytes are written
//	available	amount of bytes that may be written at "output"
//	codepoint	the codepoint to encode
uint32_t UTF8PutCodepoint( char* output , uint32_t available , uint32_t codepoint );
|
|
|
|
|
|
|
|
// Get the memory offset of a codepoint in an UTF-8 sequence based on its index.
//	input	start of the UTF-8 sequence
//	index	index of the codepoint (in characters)
// NOTE(review): behaviour for an index beyond the end of the sequence is not
// visible from this header - confirm against the implementation.
uint32_t UTF8GetMemoryOffset( char const* input , uint32_t index );
|
|
|
|
|
|
|
|
// Convert an UTF-8 sequence into an unsigned integer.
//	input		start of the UTF-8 sequence
//	size		size of the sequence (presumably in bytes - confirm)
//	ok		optional; presumably receives whether the conversion
//			succeeded - confirm against the implementation
//	base		numeric base, 10 by default
//	useSep		whether a digit separator is to be handled
//	separator	codepoint of the separator, a space by default
uint64_t UTF8ToUnsignedInteger( char const* input , uint32_t size ,
		bool * ok = nullptr , int base = 10 , bool useSep = false ,
		uint32_t separator = ' ' );
|
|
|
|
|
|
|
|
// Convert an UTF-8 sequence into a signed integer.
//	input		start of the UTF-8 sequence
//	size		size of the sequence (presumably in bytes - confirm)
//	ok		optional; presumably receives whether the conversion
//			succeeded - confirm against the implementation
//	base		numeric base, 10 by default
//	useSep		whether a digit separator is to be handled
//	separator	codepoint of the separator, a space by default
int64_t UTF8ToInteger( char const* input , uint32_t size ,
		bool * ok = nullptr , int base = 10 , bool useSep = false ,
		uint32_t separator = ' ' );
|
|
|
|
|
|
|
|
// Convert an UTF-8 sequence into a double precision floating point number. The
// sequence will be checked, converted into a C string and passed to strtod()
// for actual conversion.
//	input		start of the UTF-8 sequence
//	size		size of the sequence (presumably in bytes - confirm)
//	ok		optional; presumably receives whether the conversion
//			succeeded - confirm against the implementation
//	decimalPoint	codepoint used as the decimal point, '.' by default
//	useSep		whether a digit separator is to be handled
//	separator	codepoint of the separator, a space by default
double UTF8ToDouble( char const* input , uint32_t size ,
		bool * ok = nullptr , uint32_t decimalPoint = '.' ,
		bool useSep = false , uint32_t separator = ' ' );
|
|
|
|
|
|
|
|
|
|
|
|
/*= UNICODE CHARACTERS =======================================================*/
|
|
|
|
|
|
|
|
struct T_Character
|
|
|
|
{
|
2018-05-08 13:58:39 +02:00
|
|
|
using F_Map = std::function< T_Character( T_Character ) >;
|
|
|
|
|
2017-11-01 20:14:23 +01:00
|
|
|
const uint32_t codepoint;
|
|
|
|
|
|
|
|
T_Character( ) noexcept;
|
|
|
|
T_Character( T_Character const& other ) noexcept;
|
|
|
|
M_WITH_INT( T ) T_Character( T codepoint ) noexcept;
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
bool isValid( ) const;
|
|
|
|
bool isAscii( ) const;
|
|
|
|
bool isControl( ) const;
|
|
|
|
bool isUppercase( ) const;
|
|
|
|
bool isLowercase( ) const;
|
|
|
|
bool isAlpha( ) const;
|
|
|
|
bool isNumeric( ) const;
|
|
|
|
bool isAlphanumeric( ) const;
|
|
|
|
bool isWhitespace( ) const;
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
bool operator== ( T_Character const& other ) const;
|
|
|
|
bool operator!= ( T_Character const& other ) const;
|
|
|
|
bool operator< ( T_Character const& other ) const;
|
|
|
|
bool operator> ( T_Character const& other ) const;
|
|
|
|
bool operator<= ( T_Character const& other ) const;
|
|
|
|
bool operator>= ( T_Character const& other ) const;
|
|
|
|
|
|
|
|
M_WITH_INT( T ) bool operator== ( T other ) const;
|
|
|
|
M_WITH_INT( T ) bool operator!= ( T other ) const;
|
|
|
|
M_WITH_INT( T ) bool operator< ( T other ) const;
|
|
|
|
M_WITH_INT( T ) bool operator<= ( T other ) const;
|
|
|
|
M_WITH_INT( T ) bool operator> ( T other ) const;
|
|
|
|
M_WITH_INT( T ) bool operator>= ( T other ) const;
|
|
|
|
|
|
|
|
operator uint32_t ( ) const;
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
uint32_t writeTo( char* output , uint32_t avail ) const;
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
T_Character toUpper( ) const;
|
|
|
|
T_Character toLower( ) const;
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
M_CLASS_POINTERS( Character );
|
|
|
|
|
|
|
|
|
|
|
|
/*= IMMUTABLE UTF8 STRINGS ===================================================*/
|
|
|
|
|
|
|
|
/*
 * NOTE: the *objects* are NOT immutable. The strings they contain, however,
 * are.
 */
|
|
|
|
|
|
|
|
// Forward declarations: T_String's interface refers to both of these classes,
// which are defined later in this file.
class T_StringBuilder;
class T_StringIterator;
|
|
|
|
|
|
|
|
// A_StringData - Abstract base for the various types of string storage.
class A_StringData
{
   protected:
	// Fields returned by the accessors below. They are left for derived
	// classes to fill: the defaulted constructor does not initialise them.
	char* data_;
	uint32_t size_;
	bool valid_;
	uint32_t length_;

	// Only derived classes may construct instances; copying and moving
	// are disabled.
	A_StringData( ) = default;
	A_StringData( A_StringData const& ) = delete;
	A_StringData( A_StringData&& ) = delete;

   public:
	// Pure virtual destructor, which is what makes the class abstract.
	virtual ~A_StringData( ) = 0;

	// Is valid UTF-8?
	bool valid( ) const;
	// Get a pointer to the data
	char const * data( ) const;
	// Length in characters
	uint32_t length( ) const;
	// Size in bytes
	uint32_t size( ) const;
};
|
|
|
|
// M_ABSTRACT_POINTERS is a project macro defined outside of this file.
// Presumably it declares the pointer aliases for A_StringData, including the
// RP_StringData type used by T_String and T_StringIterator - TODO confirm.
M_ABSTRACT_POINTERS( StringData );
|
|
|
|
|
|
|
|
/*----------------------------------------------------------------------------*/
|
|
|
|
|
|
|
|
// T_String - Main UTF-8 string class
|
|
|
|
class T_String
|
|
|
|
{
|
|
|
|
private:
|
|
|
|
RP_StringData data_;
|
|
|
|
|
|
|
|
public:
|
|
|
|
// Construct an empty string. No overhead whatsoever, just an assignment
|
|
|
|
T_String( ) noexcept;
|
|
|
|
|
|
|
|
// This constructor will try to:
|
|
|
|
// - use the empty string if initial is null or ""
|
|
|
|
// - use a pooled string if one matches initial
|
|
|
|
// - create a dynamic string otherwise.
|
|
|
|
// Because of this, it is relatively slow and should be avoided in
|
|
|
|
// general.
|
2017-11-02 10:40:26 +01:00
|
|
|
T_String( char const* initial );
|
2017-11-01 20:14:23 +01:00
|
|
|
|
|
|
|
T_String( T_StringBuilder&& sb );
|
|
|
|
T_String( T_StringBuilder const& sb );
|
|
|
|
|
|
|
|
// Construct a dynamic string, either using the provided memory
|
|
|
|
// or duplicating it. Doesn't use the pools at all.
|
|
|
|
T_String( char const* data , uint32_t size , bool nodup = false );
|
|
|
|
|
|
|
|
T_String( T_String const& source );
|
|
|
|
T_String( T_String&& source ) noexcept;
|
|
|
|
|
|
|
|
~T_String( );
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
// Get a pooled string. Faster than constructing then calling
|
|
|
|
// addToPool( ) if the string is already pooled. If it isn't, it will
|
|
|
|
// be added as a dynamic string.
|
|
|
|
static T_String Pooled( char const* string );
|
|
|
|
static T_String Pooled( char const* data , uint32_t size );
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
T_String& operator= ( T_String&& string ) noexcept;
|
|
|
|
T_String& operator= ( T_String const& string );
|
|
|
|
|
|
|
|
T_String& operator= ( T_StringBuilder&& sb );
|
|
|
|
T_String& operator= ( T_StringBuilder const& sb );
|
|
|
|
|
|
|
|
friend void swap( T_String& lhs , T_String& rhs ) noexcept;
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
// Adds the string to the pool if it isn't pooled already. If the pool
|
|
|
|
// already contains a pooled version of this string, use it instead.
|
|
|
|
T_String& addToPool( );
|
|
|
|
|
|
|
|
// Attempts to use the pooled version of a string if it exists. If it
|
|
|
|
// doesn't, keep using the current version.
|
|
|
|
T_String& usePool( );
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
bool valid( ) const;
|
|
|
|
uint32_t size( ) const;
|
|
|
|
uint32_t length( ) const;
|
|
|
|
char const * data( ) const;
|
|
|
|
operator bool ( ) const;
|
|
|
|
bool operator! ( ) const;
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
// Return the character at the specified index
|
|
|
|
T_Character operator[] ( uint32_t index ) const;
|
|
|
|
|
|
|
|
// Return a substring from the left side of the string
|
|
|
|
T_String left( uint32_t count ) const;
|
|
|
|
// Return a substring from the right side of the string
|
|
|
|
T_String right( uint32_t count ) const;
|
|
|
|
// Return a substring from the specified offset
|
|
|
|
T_String substr( uint32_t offset , uint32_t count = UINT32_MAX ) const;
|
|
|
|
// Return the substring between the two specified offsets
|
|
|
|
T_String range( uint32_t start , uint32_t end ) const;
|
|
|
|
|
|
|
|
// Remove whitespace from the start and end of the string
|
|
|
|
T_String trim( ) const noexcept;
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------
|
|
|
|
|
2018-05-08 13:58:39 +02:00
|
|
|
// Generate a string using a function that transforms characters
|
|
|
|
T_String mapped( T_Character::F_Map f ) const noexcept;
|
|
|
|
|
|
|
|
// Convert the string to uppercase
|
|
|
|
T_String toUpper( ) const noexcept;
|
|
|
|
// Convert the string to lowercase
|
|
|
|
T_String toLower( ) const noexcept;
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------
|
|
|
|
|
2017-11-01 20:14:23 +01:00
|
|
|
bool equals( T_String const& other ) const;
|
|
|
|
bool equals( char const* string ) const;
|
|
|
|
|
|
|
|
int32_t compare( T_String const& other ) const;
|
|
|
|
int32_t compareIgnoreCase( T_String const& other ) const;
|
|
|
|
|
|
|
|
bool startsWith( T_String const& other ) const;
|
|
|
|
bool endsWith( T_String const& other ) const;
|
|
|
|
|
|
|
|
// Finds a sub-string. Returns -1 if it isn't found.
|
|
|
|
int32_t find( T_String const& other , uint32_t from = 0 ) const;
|
|
|
|
// Finds a character. Returns -1 if it isn't found.
|
|
|
|
int32_t find( T_Character character , uint32_t from = 0 ) const;
|
|
|
|
|
|
|
|
bool operator== ( T_String const& other ) const;
|
|
|
|
bool operator!= ( T_String const& other ) const;
|
|
|
|
bool operator< ( T_String const& other ) const;
|
|
|
|
bool operator> ( T_String const& other ) const;
|
|
|
|
bool operator>= ( T_String const& other ) const;
|
|
|
|
bool operator<= ( T_String const& other ) const;
|
|
|
|
|
|
|
|
bool operator== ( char const* string ) const;
|
|
|
|
bool operator!= ( char const* string ) const;
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
T_String replace( T_Character initial , T_Character replacement ) const;
|
|
|
|
T_String replace( T_String const& initial , T_String const& replacement ) const;
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
uint64_t toUnsignedInteger( bool * ok = nullptr , int base = 10 ,
|
|
|
|
bool useSep = false , T_Character separator = ' ' ) const;
|
|
|
|
int64_t toInteger( bool * ok = nullptr , int base = 10 , bool useSep = false ,
|
|
|
|
T_Character separator = ' ' ) const;
|
|
|
|
double toDouble( bool * ok = nullptr , T_Character decimalPoint = '.' ,
|
|
|
|
bool useSep = false , T_Character separator = ' ' ) const;
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
T_StringIterator getIterator( uint32_t offset ) const;
|
|
|
|
operator T_StringIterator( ) const;
|
|
|
|
|
|
|
|
// Converts the string to an array of bytes suitable for use with the
|
|
|
|
// operating system's functions (e.g. UTF-8 C string on Linux, or
|
|
|
|
// UTF-16 strings on Windows)
|
|
|
|
T_Buffer< char > toOSString( ) const;
|
|
|
|
};
|
|
|
|
M_CLASS_POINTERS( String );
|
|
|
|
M_DECLARE_HASH( T_String );
|
|
|
|
M_DECLARE_COMPARATOR( T_String );
|
|
|
|
|
|
|
|
// Swap two strings. Namespace-scope declaration of the function that T_String
// declares as a friend.
void swap( T_String& lhs , T_String& rhs ) noexcept;
|
|
|
|
|
|
|
|
extern template class T_Array< T_String >;
|
|
|
|
extern template class T_MultiArray< T_String >;
|
|
|
|
|
|
|
|
|
|
|
|
/*= STRING ITERATORS =========================================================*/
|
|
|
|
|
|
|
|
class T_StringIterator final
|
|
|
|
{
|
|
|
|
friend class T_String;
|
|
|
|
|
|
|
|
private:
|
|
|
|
RP_StringData data_;
|
|
|
|
uint32_t pos_;
|
|
|
|
uint32_t index_;
|
|
|
|
uint32_t codepoint_;
|
|
|
|
uint32_t bytes_;
|
|
|
|
|
|
|
|
T_StringIterator( RP_StringData data , uint32_t index );
|
|
|
|
|
|
|
|
public:
|
|
|
|
T_StringIterator( ) = delete;
|
|
|
|
T_StringIterator( T_StringIterator const& other );
|
|
|
|
T_StringIterator( T_StringIterator&& other ) noexcept;
|
|
|
|
|
|
|
|
~T_StringIterator( );
|
|
|
|
|
|
|
|
T_StringIterator& operator= ( T_StringIterator const& other );
|
|
|
|
T_StringIterator& operator= ( T_StringIterator&& other ) noexcept;
|
|
|
|
|
|
|
|
friend void swap( T_StringIterator& lhs , T_StringIterator& rhs ) noexcept;
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
bool next( );
|
|
|
|
|
|
|
|
uint32_t index( ) const;
|
|
|
|
bool atEnd( ) const;
|
|
|
|
|
|
|
|
T_Character character( ) const;
|
|
|
|
operator T_Character ( ) const;
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
M_CLASS_POINTERS( StringIterator );
|
|
|
|
// Swap two string iterators. Namespace-scope declaration of the function that
// T_StringIterator declares as a friend.
void swap( T_StringIterator& lhs , T_StringIterator& rhs ) noexcept;
|
|
|
|
|
|
|
|
|
|
|
|
/*= STRING BUILDERS ==========================================================*/
|
|
|
|
|
|
|
|
class T_StringBuilder
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
enum : uint32_t { C_GROWTH = 32 };
|
|
|
|
|
|
|
|
private:
|
|
|
|
char* data_;
|
|
|
|
uint32_t capacity_;
|
|
|
|
uint32_t size_;
|
|
|
|
uint32_t length_;
|
|
|
|
|
|
|
|
friend class T_String;
|
|
|
|
|
|
|
|
public:
|
|
|
|
T_StringBuilder( ) noexcept;
|
|
|
|
|
|
|
|
T_StringBuilder( T_StringBuilder const& other );
|
|
|
|
T_StringBuilder( T_StringBuilder&& other ) noexcept;
|
|
|
|
|
|
|
|
T_StringBuilder( char const* data , uint32_t size );
|
|
|
|
explicit T_StringBuilder( T_String const& string );
|
|
|
|
explicit T_StringBuilder( char const* string );
|
|
|
|
|
|
|
|
~T_StringBuilder( );
|
|
|
|
|
|
|
|
T_StringBuilder& operator =( T_StringBuilder const& other );
|
|
|
|
T_StringBuilder& operator =( T_StringBuilder&& other ) noexcept;
|
|
|
|
|
|
|
|
friend void swap( T_StringBuilder& lhs , T_StringBuilder& rhs );
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
char const * data( ) const;
|
|
|
|
uint32_t capacity( ) const;
|
|
|
|
uint32_t size( ) const;
|
|
|
|
uint32_t length( ) const;
|
|
|
|
operator bool ( ) const;
|
|
|
|
bool operator! ( ) const;
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
T_StringBuilder& ensureCapacity( uint32_t minCap );
|
|
|
|
T_StringBuilder& clear( );
|
|
|
|
T_StringBuilder& free( );
|
2017-11-29 09:24:51 +01:00
|
|
|
T_StringBuilder& truncate( uint32_t maxLength ) noexcept;
|
2017-11-01 20:14:23 +01:00
|
|
|
|
|
|
|
// ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
T_StringBuilder& append( T_StringBuilder const& other );
|
|
|
|
T_StringBuilder& append( T_StringBuilder&& other );
|
|
|
|
T_StringBuilder& append( T_String const& string );
|
|
|
|
T_StringBuilder& append( char const* string , uint32_t size );
|
|
|
|
T_StringBuilder& append( char character );
|
|
|
|
T_StringBuilder& append( T_Character character );
|
|
|
|
|
|
|
|
T_StringBuilder& appendNumeric( int64_t value , int base = 10 , bool useSep = false ,
|
|
|
|
T_Character sep = ' ' , int sepEvery = 3 );
|
|
|
|
|
|
|
|
T_StringBuilder& appendNumeric( uint64_t value , int base = 10 , bool useSep = false ,
|
|
|
|
T_Character sep = ' ' , int sepEvery = 3 );
|
|
|
|
|
|
|
|
T_StringBuilder& appendDouble( double value , uint32_t precision = 6 , bool trailingZeros = false );
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
bool operator== ( T_StringBuilder const& other ) const;
|
|
|
|
bool operator!= ( T_StringBuilder const& other ) const;
|
|
|
|
|
|
|
|
bool operator== ( T_String const& string ) const;
|
|
|
|
bool operator!= ( T_String const& string ) const;
|
|
|
|
|
|
|
|
bool operator== ( char const* string ) const;
|
|
|
|
bool operator!= ( char const* string ) const;
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
uint64_t toUnsignedInteger( bool * ok = nullptr , int base = 10 ,
|
|
|
|
bool useSep = false , T_Character separator = ' ' ) const;
|
|
|
|
int64_t toInteger( bool * ok = nullptr , int base = 10 , bool useSep = false ,
|
|
|
|
T_Character separator = ' ' ) const;
|
|
|
|
double toDouble( bool * ok = nullptr , T_Character decimalPoint = '.' ,
|
|
|
|
bool useSep = false , T_Character separator = ' ' ) const;
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
M_CLASS_POINTERS( StringBuilder );
|
|
|
|
// Swap two string builders. Namespace-scope declaration of the function that
// T_StringBuilder declares as a friend.
// NOTE(review): unlike the swap functions for T_String and T_StringIterator,
// this one is not declared noexcept.
void swap( T_StringBuilder& lhs , T_StringBuilder& rhs );
|
|
|
|
|
|
|
|
// Operator <<
|
|
|
|
M_LSHIFT_OP( T_StringBuilder , T_StringBuilder const& );
|
|
|
|
M_LSHIFT_OP( T_StringBuilder , T_StringBuilder && );
|
|
|
|
M_LSHIFT_OP( T_StringBuilder , T_String const& );
|
|
|
|
M_LSHIFT_OP( T_StringBuilder , char const* );
|
|
|
|
M_LSHIFT_OP( T_StringBuilder , char );
|
|
|
|
M_LSHIFT_OP( T_StringBuilder , T_Character );
|
|
|
|
M_LSHIFT_OP( T_StringBuilder , int16_t );
|
|
|
|
M_LSHIFT_OP( T_StringBuilder , int32_t );
|
|
|
|
M_LSHIFT_OP( T_StringBuilder , int64_t );
|
|
|
|
M_LSHIFT_OP( T_StringBuilder , uint16_t );
|
|
|
|
M_LSHIFT_OP( T_StringBuilder , uint32_t );
|
|
|
|
M_LSHIFT_OP( T_StringBuilder , uint64_t );
|
|
|
|
M_LSHIFT_OP( T_StringBuilder , float );
|
|
|
|
M_LSHIFT_OP( T_StringBuilder , double );
|
|
|
|
|
|
|
|
|
|
|
|
} // namespace
|
2017-11-01 21:44:54 +01:00
|
|
|
#endif // _H_EBCL_STRINGS
|
|
|
|
#include <ebcl/inline/Strings.hh>
|