// corelib/include/ebcl/Strings.hh
// (repository web view metadata: 465 lines, 15 KiB, C++)

/******************************************************************************/
/* STRINGS AND RELATED UTILITIES **********************************************/
/******************************************************************************/
#ifndef _H_EBCL_STRINGS
#define _H_EBCL_STRINGS
#include <ebcl/Externals.hh>
#include <ebcl/Pointers.hh>
#include <ebcl/Arrays.hh>
#include <ebcl/Buffers.hh>
namespace ebcl {
/*= UTF-8 UTILITY FUNCTIONS ==================================================*/
// Checks whether the specified 0-terminated C string is valid UTF-8.
bool UTF8IsValid( char const* string );
// Gets the length (in characters, not bytes) of the specified 0-terminated
// UTF-8 string.
uint32_t UTF8Length( char const* string );
// Gets the size (in bytes) of the specified 0-terminated UTF-8 string.
// NOTE(review): whether the terminating 0 is counted is not visible from this
// header -- confirm against the implementation.
uint32_t UTF8Size( char const* string );
// Combined function that does all of the above in a single call; "size" and
// "length" are output parameters.
// NOTE(review): the boolean result is presumably the outcome of the validity
// check -- confirm against the implementation.
bool UTF8Info( char const* string , uint32_t& size , uint32_t& length );
// Checks if the specified data is a valid UTF-8 string, and computes its
// length (in characters) into "length". The amount of input bytes is passed
// explicitly through "size".
bool UTF8BufferInfo( char const* data , uint32_t size , uint32_t& length );
// Gets the codepoint from a sequence of UTF-8 bytes. Sets "bytes" to the
// amount of bytes read from the input.
// NOTE(review): no input size is passed, so the caller presumably has to
// guarantee that "data" holds a complete sequence -- confirm.
uint32_t UTF8GetCodepoint( char const* data , uint32_t& bytes );
// Same as the overload above, without reporting the amount of bytes read.
uint32_t UTF8GetCodepoint( char const* data );
// Writes an UTF-8 encoded codepoint to "output"; "available" is the space
// that may be used. Returns the amount of bytes that were written, or 0 if
// there wasn't enough space.
uint32_t UTF8PutCodepoint( char* output , uint32_t available , uint32_t codepoint );
// Gets the memory offset of a codepoint in an UTF-8 sequence based on its
// index (presumably: character index in, byte offset out -- confirm).
uint32_t UTF8GetMemoryOffset( char const* input , uint32_t index );
// Converts an UTF-8 sequence of "size" bytes into an unsigned integer.
//   ok        - optional output flag, may be left null (the default).
//               NOTE(review): presumably receives whether the conversion
//               succeeded -- confirm against the implementation.
//   base      - numeric base, 10 by default.
//   useSep    - NOTE(review): presumably allows "separator" to appear
//               between digits (off by default) -- confirm.
//   separator - separator codepoint, a space by default.
uint64_t UTF8ToUnsignedInteger( char const* input , uint32_t size ,
bool * ok = nullptr , int base = 10 , bool useSep = false ,
uint32_t separator = ' ' );
// Converts an UTF-8 sequence into a signed integer. Takes the same parameters
// as UTF8ToUnsignedInteger( ), with the same defaults.
int64_t UTF8ToInteger( char const* input , uint32_t size ,
bool * ok = nullptr , int base = 10 , bool useSep = false ,
uint32_t separator = ' ' );
// Converts an UTF-8 sequence into a double precision floating point number.
// The sequence will be checked, converted into a C string and passed to
// strtod() for actual conversion.
//   decimalPoint - codepoint used as the decimal point in the input, '.' by
//                  default.
//   ok, useSep and separator have the same signature and defaults as in the
//   integer conversions above.
double UTF8ToDouble( char const* input , uint32_t size ,
bool * ok = nullptr , uint32_t decimalPoint = '.' ,
bool useSep = false , uint32_t separator = ' ' );
/*= UNICODE CHARACTERS =======================================================*/
// T_Character - A single Unicode character, stored as its codepoint.
struct T_Character
{
// Signature of character transformation functions, as taken by
// T_String::mapped( ).
using F_Map = std::function< T_Character( T_Character ) >;
// The codepoint itself. It is const, so a character cannot be modified once
// it has been constructed.
const uint32_t codepoint;
// Construction: default, copy, and from an integer value.
// NOTE(review): M_WITH_INT is defined outside this file; it presumably
// introduces a template parameter T restricted to integer types -- confirm.
// NOTE(review): the codepoint of a default-constructed character is not
// visible from this header.
constexpr T_Character( ) noexcept;
constexpr T_Character( T_Character const& other ) noexcept;
M_WITH_INT( T ) constexpr T_Character( T codepoint ) noexcept;
// ---------------------------------------------------------------------
// Classification predicates.
// NOTE(review): the exact character ranges each predicate accepts are not
// visible from this header.
constexpr bool isValid( ) const;
constexpr bool isAscii( ) const;
constexpr bool isControl( ) const;
constexpr bool isUppercase( ) const;
constexpr bool isLowercase( ) const;
constexpr bool isAlpha( ) const;
constexpr bool isNumeric( ) const;
constexpr bool isAlphanumeric( ) const;
constexpr bool isWhitespace( ) const;
// ---------------------------------------------------------------------
// Comparisons with another character.
constexpr bool operator== ( T_Character const& other ) const;
constexpr bool operator!= ( T_Character const& other ) const;
constexpr bool operator< ( T_Character const& other ) const;
constexpr bool operator> ( T_Character const& other ) const;
constexpr bool operator<= ( T_Character const& other ) const;
constexpr bool operator>= ( T_Character const& other ) const;
// Comparisons with an integer value.
M_WITH_INT( T ) constexpr bool operator== ( T other ) const;
M_WITH_INT( T ) constexpr bool operator!= ( T other ) const;
M_WITH_INT( T ) constexpr bool operator< ( T other ) const;
M_WITH_INT( T ) constexpr bool operator<= ( T other ) const;
M_WITH_INT( T ) constexpr bool operator> ( T other ) const;
M_WITH_INT( T ) constexpr bool operator>= ( T other ) const;
// Implicit conversion to a 32-bit unsigned integer (presumably the codepoint
// value -- confirm).
constexpr operator uint32_t ( ) const;
// ---------------------------------------------------------------------
// Writes the character to a buffer in which "avail" bytes may be used.
// NOTE(review): presumably UTF-8 encodes the codepoint and returns the
// amount of bytes written, like UTF8PutCodepoint( ) -- confirm.
uint32_t writeTo( char* output , uint32_t avail ) const;
// ---------------------------------------------------------------------
// Case conversions. Both return a new character and leave this one as it
// is.
constexpr T_Character toUpper( ) const;
constexpr T_Character toLower( ) const;
};
// NOTE(review): macro defined outside this file; presumably declares the
// pointer type aliases for T_Character -- confirm.
M_CLASS_POINTERS( Character );
/*= IMMUTABLE UTF8 STRINGS ===================================================*/
/*
* NOTE: the *objects* are NOT immutable. The strings they contain, however,
* are.
*/
// Forward declarations: T_String's interface refers to both of these types
// before their definitions appear further down in this header.
class T_StringBuilder;
class T_StringIterator;
// A_StringData - Abstract base for the various types of string storage.
class A_StringData
{
protected:
// Storage description, exposed read-only through the accessors below.
// NOTE(review): no default member initialisers are present; derived classes
// are presumably responsible for setting these fields -- confirm.
char* data_;
uint32_t size_;
bool valid_;
uint32_t length_;
// Construction is reserved to derived classes; instances can be neither
// copied nor moved.
A_StringData( ) = default;
A_StringData( A_StringData const& ) = delete;
A_StringData( A_StringData&& ) = delete;
public:
// Pure virtual destructor: makes the class abstract. A definition must still
// exist somewhere, as derived destructors will call it.
virtual ~A_StringData( ) = 0;
// Is valid UTF-8?
bool valid( ) const;
// Get a pointer to the data
char const * data( ) const;
// Length in characters
uint32_t length( ) const;
// Size in bytes
uint32_t size( ) const;
};
// NOTE(review): macro defined outside this file; the RP_StringData type used
// by T_String and T_StringIterator presumably comes from it -- confirm.
M_ABSTRACT_POINTERS( StringData );
/*----------------------------------------------------------------------------*/
// T_String - Main UTF-8 string class. The string an instance refers to is
// immutable, but the instance itself can be reassigned; all transformations
// below are const and return a new T_String.
class T_String
{
private:
// The storage this string refers to (see A_StringData).
RP_StringData data_;
public:
// Construct an empty string. No overhead whatsoever, just an assignment
T_String( ) noexcept;
// This constructor will try to:
// - use the empty string if initial is null or ""
// - use a pooled string if one matches initial
// - create a dynamic string otherwise.
// Because of this, it is relatively slow and should be avoided in
// general.
T_String( char const* initial );
// Construct a string from a string builder's contents. The rvalue overload
// may presumably take over the builder's buffer instead of copying it
// -- NOTE(review): confirm against the implementation.
T_String( T_StringBuilder&& sb );
T_String( T_StringBuilder const& sb );
// Construct a dynamic string, either using the provided memory
// or duplicating it. Doesn't use the pools at all.
// NOTE(review): with nodup set, the lifetime/ownership requirements on
// "data" are not visible from this header -- confirm before use.
T_String( char const* data , uint32_t size , bool nodup = false );
// Copy and move construction; destruction.
T_String( T_String const& source );
T_String( T_String&& source ) noexcept;
~T_String( );
// ---------------------------------------------------------------------
// Get a pooled string. Faster than constructing then calling
// addToPool( ) if the string is already pooled. If it isn't, it will
// be added as a dynamic string.
static T_String Pooled( char const* string );
static T_String Pooled( char const* data , uint32_t size );
// ---------------------------------------------------------------------
// Assignment from other strings and from string builders.
T_String& operator= ( T_String&& string ) noexcept;
T_String& operator= ( T_String const& string );
T_String& operator= ( T_StringBuilder&& sb );
T_String& operator= ( T_StringBuilder const& sb );
friend void swap( T_String& lhs , T_String& rhs ) noexcept;
// ---------------------------------------------------------------------
// Adds the string to the pool if it isn't pooled already. If the pool
// already contains a pooled version of this string, use it instead.
T_String& addToPool( );
// Attempts to use the pooled version of a string if it exists. If it
// doesn't, keep using the current version.
T_String& usePool( );
// ---------------------------------------------------------------------
// Accessors mirroring A_StringData's: UTF-8 validity, size in bytes, length
// in characters, pointer to the data.
bool valid( ) const;
uint32_t size( ) const;
uint32_t length( ) const;
char const * data( ) const;
// Boolean tests. NOTE(review): presumably "true" means a non-empty string
// -- confirm. The conversion is not explicit.
operator bool ( ) const;
bool operator! ( ) const;
// ---------------------------------------------------------------------
// Return the character at the specified index
T_Character operator[] ( uint32_t index ) const;
// Return a substring from the left side of the string
T_String left( uint32_t count ) const;
// Return a substring from the right side of the string
T_String right( uint32_t count ) const;
// Return a substring from the specified offset; by default, count is
// UINT32_MAX (presumably meaning "up to the end" -- confirm).
T_String substr( uint32_t offset , uint32_t count = UINT32_MAX ) const;
// Return the substring between the two specified offsets
// NOTE(review): whether "end" is inclusive is not visible from this header.
T_String range( uint32_t start , uint32_t end ) const;
// Remove whitespace from the start and end of the string
T_String trim( ) const noexcept;
// ---------------------------------------------------------------------
// Generate a string using a function that transforms characters
T_String mapped( T_Character::F_Map f ) const noexcept;
// Convert the string to uppercase
T_String toUpper( ) const noexcept;
// Convert the string to lowercase
T_String toLower( ) const noexcept;
// ---------------------------------------------------------------------
// Equality tests and ordering comparisons.
// NOTE(review): compare( ) and compareIgnoreCase( ) presumably return a
// negative, zero or positive value in the manner of strcmp( ) -- confirm.
bool equals( T_String const& other ) const;
bool equals( char const* string ) const;
int32_t compare( T_String const& other ) const;
int32_t compareIgnoreCase( T_String const& other ) const;
bool startsWith( T_String const& other ) const;
bool endsWith( T_String const& other ) const;
// Finds a sub-string. Returns -1 if it isn't found.
int32_t find( T_String const& other , uint32_t from = 0 ) const;
// Finds a character. Returns -1 if it isn't found.
int32_t find( T_Character character , uint32_t from = 0 ) const;
// Comparison operators, against other strings and against C strings.
bool operator== ( T_String const& other ) const;
bool operator!= ( T_String const& other ) const;
bool operator< ( T_String const& other ) const;
bool operator> ( T_String const& other ) const;
bool operator>= ( T_String const& other ) const;
bool operator<= ( T_String const& other ) const;
bool operator== ( char const* string ) const;
bool operator!= ( char const* string ) const;
// ---------------------------------------------------------------------
// Return a copy of the string in which "initial" has been replaced with
// "replacement" (single character or sub-string variants).
// NOTE(review): presumably replaces all occurrences -- confirm.
T_String replace( T_Character initial , T_Character replacement ) const;
T_String replace( T_String const& initial , T_String const& replacement ) const;
// ---------------------------------------------------------------------
// Numeric conversions. The parameters and defaults match those of
// UTF8ToUnsignedInteger( ), UTF8ToInteger( ) and UTF8ToDouble( ), with
// T_Character used for the separator and decimal point.
uint64_t toUnsignedInteger( bool * ok = nullptr , int base = 10 ,
bool useSep = false , T_Character separator = ' ' ) const;
int64_t toInteger( bool * ok = nullptr , int base = 10 , bool useSep = false ,
T_Character separator = ' ' ) const;
double toDouble( bool * ok = nullptr , T_Character decimalPoint = '.' ,
bool useSep = false , T_Character separator = ' ' ) const;
// ---------------------------------------------------------------------
// Iteration over the string's characters: either from an explicit offset,
// or through an implicit conversion to an iterator (presumably positioned
// at the start of the string -- confirm).
T_StringIterator getIterator( uint32_t offset ) const;
operator T_StringIterator( ) const;
// Converts the string to an array of bytes suitable for use with the
// operating system's functions (e.g. UTF-8 C string on Linux, or
// UTF-16 strings on Windows)
T_Buffer< char > toOSString( ) const;
};
// NOTE(review): macro defined outside this file; presumably declares the
// pointer type aliases for T_String -- confirm.
M_CLASS_POINTERS( String );
// NOTE(review): both macros are defined outside this file; judging by their
// names they declare the hash function and the comparator used when T_String
// is stored in the library's containers -- confirm.
M_DECLARE_HASH( T_String );
M_DECLARE_COMPARATOR( T_String );
// Namespace-scope declaration of the swap( ) befriended by T_String.
void swap( T_String& lhs , T_String& rhs ) noexcept;
// Explicit instantiation declaration: code including this header will not
// instantiate T_Array< T_String > implicitly, so the matching explicit
// instantiation must be provided by one of the library's source files.
extern template class T_Array< T_String >;
/*= STRING ITERATORS =========================================================*/
// T_StringIterator - Forward iteration over the characters of a string.
// Instances are obtained from T_String (getIterator( ) or the conversion
// operator), which is a friend so that it can use the private constructor.
class T_StringIterator final
{
friend class T_String;
private:
// The string storage being iterated over.
RP_StringData data_;
// Iteration state.
// NOTE(review): presumably pos_ is the current byte position, index_ the
// current character index, codepoint_ the current character and bytes_ its
// encoded size -- confirm against the implementation.
uint32_t pos_;
uint32_t index_;
uint32_t codepoint_;
uint32_t bytes_;
// Only T_String may create iterators from scratch.
T_StringIterator( RP_StringData data , uint32_t index );
public:
// No default construction; iterators can be copied, moved and swapped.
T_StringIterator( ) = delete;
T_StringIterator( T_StringIterator const& other );
T_StringIterator( T_StringIterator&& other ) noexcept;
~T_StringIterator( );
T_StringIterator& operator= ( T_StringIterator const& other );
T_StringIterator& operator= ( T_StringIterator&& other ) noexcept;
friend void swap( T_StringIterator& lhs , T_StringIterator& rhs ) noexcept;
// ---------------------------------------------------------------------
// Moves on to the next character.
// NOTE(review): the meaning of the boolean result (presumably false once
// the end has been reached) is not visible from this header -- confirm.
bool next( );
// Index of the current character
uint32_t index( ) const;
// Has the end of the string been reached?
bool atEnd( ) const;
// The current character, as an explicit call or an implicit conversion
T_Character character( ) const;
operator T_Character ( ) const;
};
// NOTE(review): macro defined outside this file; presumably declares the
// pointer type aliases for T_StringIterator -- confirm.
M_CLASS_POINTERS( StringIterator );
// Namespace-scope declaration of the swap( ) befriended by T_StringIterator.
void swap( T_StringIterator& lhs , T_StringIterator& rhs ) noexcept;
/*= STRING BUILDERS ==========================================================*/
// T_StringBuilder - Mutable buffer used to assemble UTF-8 strings. Most
// modifying methods return the builder itself so that calls can be chained.
// T_String is a friend, and can be constructed or assigned from a builder.
class T_StringBuilder
{
public:
// NOTE(review): presumably the granularity, in bytes, by which the buffer
// grows -- confirm against the implementation.
enum : uint32_t { C_GROWTH = 32 };
private:
// Buffer, allocated capacity, used size (bytes) and length (characters), as
// exposed by the accessors below.
char* data_;
uint32_t capacity_;
uint32_t size_;
uint32_t length_;
friend class T_String;
public:
// Construction: empty, copy, move, or with initial contents taken from a
// sized buffer, a T_String or a C string.
T_StringBuilder( ) noexcept;
T_StringBuilder( T_StringBuilder const& other );
T_StringBuilder( T_StringBuilder&& other ) noexcept;
T_StringBuilder( char const* data , uint32_t size );
explicit T_StringBuilder( T_String const& string );
explicit T_StringBuilder( char const* string );
~T_StringBuilder( );
T_StringBuilder& operator =( T_StringBuilder const& other );
T_StringBuilder& operator =( T_StringBuilder&& other ) noexcept;
// NOTE(review): unlike the swap( ) overloads for T_String and
// T_StringIterator, this one is not declared noexcept.
friend void swap( T_StringBuilder& lhs , T_StringBuilder& rhs );
// ---------------------------------------------------------------------
// Accessors: pointer to the contents, allocated capacity, used size and
// length.
// NOTE(review): whether data( ) is 0-terminated is not visible from this
// header; use size( ) to delimit it.
char const * data( ) const;
uint32_t capacity( ) const;
uint32_t size( ) const;
uint32_t length( ) const;
// Boolean tests. NOTE(review): presumably "true" means the builder is not
// empty -- confirm. The conversion is not explicit.
operator bool ( ) const;
bool operator! ( ) const;
// ---------------------------------------------------------------------
// Storage management.
// NOTE(review): presumably ensureCapacity( ) grows the buffer to at least
// minCap bytes, clear( ) empties the builder while keeping its buffer and
// free( ) also releases the buffer -- confirm against the implementation.
T_StringBuilder& ensureCapacity( uint32_t minCap );
T_StringBuilder& clear( );
T_StringBuilder& free( );
// Shortens the contents to at most maxLength (presumably expressed in
// characters, as the name suggests -- confirm).
T_StringBuilder& truncate( uint32_t maxLength ) noexcept;
// ---------------------------------------------------------------------
// Append text: another builder, a string, a sized buffer or a single
// character.
T_StringBuilder& append( T_StringBuilder const& other );
T_StringBuilder& append( T_StringBuilder&& other );
T_StringBuilder& append( T_String const& string );
T_StringBuilder& append( char const* string , uint32_t size );
T_StringBuilder& append( char character );
T_StringBuilder& append( T_Character character );
// Append the textual representation of an integer in the specified base
// (10 by default).
// NOTE(review): presumably, when useSep is set, "sep" is inserted every
// "sepEvery" digits (3 by default) -- confirm.
T_StringBuilder& appendNumeric( int64_t value , int base = 10 , bool useSep = false ,
T_Character sep = ' ' , int sepEvery = 3 );
T_StringBuilder& appendNumeric( uint64_t value , int base = 10 , bool useSep = false ,
T_Character sep = ' ' , int sepEvery = 3 );
// Append the textual representation of a floating point value.
// NOTE(review): the exact meaning of "precision" (6 by default) and
// "trailingZeros" is not visible from this header.
T_StringBuilder& appendDouble( double value , uint32_t precision = 6 , bool trailingZeros = false );
// ---------------------------------------------------------------------
// Content comparisons with other builders, strings and C strings.
bool operator== ( T_StringBuilder const& other ) const;
bool operator!= ( T_StringBuilder const& other ) const;
bool operator== ( T_String const& string ) const;
bool operator!= ( T_String const& string ) const;
bool operator== ( char const* string ) const;
bool operator!= ( char const* string ) const;
// ---------------------------------------------------------------------
// Numeric conversions of the builder's contents; same parameters and
// defaults as the T_String methods of the same names.
uint64_t toUnsignedInteger( bool * ok = nullptr , int base = 10 ,
bool useSep = false , T_Character separator = ' ' ) const;
int64_t toInteger( bool * ok = nullptr , int base = 10 , bool useSep = false ,
T_Character separator = ' ' ) const;
double toDouble( bool * ok = nullptr , T_Character decimalPoint = '.' ,
bool useSep = false , T_Character separator = ' ' ) const;
};
// NOTE(review): macro defined outside this file; presumably declares the
// pointer type aliases for T_StringBuilder -- confirm.
M_CLASS_POINTERS( StringBuilder );
// Namespace-scope declaration of the swap( ) befriended by T_StringBuilder.
void swap( T_StringBuilder& lhs , T_StringBuilder& rhs );
// Operator <<
// One declaration per type that can be streamed into a T_StringBuilder.
// NOTE(review): M_LSHIFT_OP is defined outside this file; it presumably
// declares operator<< for the given left-hand and right-hand types, each
// forwarding to the matching append method -- confirm.
M_LSHIFT_OP( T_StringBuilder , T_StringBuilder const& );
M_LSHIFT_OP( T_StringBuilder , T_StringBuilder && );
M_LSHIFT_OP( T_StringBuilder , T_String const& );
M_LSHIFT_OP( T_StringBuilder , char const* );
M_LSHIFT_OP( T_StringBuilder , char );
M_LSHIFT_OP( T_StringBuilder , T_Character );
M_LSHIFT_OP( T_StringBuilder , int16_t );
M_LSHIFT_OP( T_StringBuilder , int32_t );
M_LSHIFT_OP( T_StringBuilder , int64_t );
M_LSHIFT_OP( T_StringBuilder , uint16_t );
M_LSHIFT_OP( T_StringBuilder , uint32_t );
M_LSHIFT_OP( T_StringBuilder , uint64_t );
M_LSHIFT_OP( T_StringBuilder , float );
M_LSHIFT_OP( T_StringBuilder , double );
} // namespace
#endif // _H_EBCL_STRINGS
#include <ebcl/inline/Strings.hh>