// xchar.h
//
// Author David Barrett-Lennard
// (C)opyright Cedanet Pty Ltd 2010
#pragma once
#ifndef Ceda_cxUtils_xchar_H
#define Ceda_cxUtils_xchar_H
#include <cctype>
#include <stdint.h>
#ifdef _WIN32
// Convert a UTF-8 string expression 's' to a zero terminated UTF-16 string for
// passing to Win32 functions taking an LPCWSTR parameter.
// NOTE(review): ceda::AsString16(s) returns a temporary; the pointer produced by
// c_str() is only valid until the end of the enclosing full expression, so the
// result of this macro must not be stored -- confirm against AsString16's docs.
#define as_LPCWSTR(s) (const wchar_t*) ceda::AsString16(s).c_str()
// As for as_LPCWSTR, but 's' has an explicit length 'len' (presumably so it need
// not be zero terminated) -- TODO confirm against the AsString16 overload.
#define as_LPCWSTRn(s,len) (const wchar_t*) ceda::AsString16(s,len).c_str()
#endif
// Convert a string expression 's' to a zero terminated UTF-8 string (ConstStringZ).
// Same temporary-lifetime caveat as as_LPCWSTR above.
#define as_ConstStringZ(s) ceda::AsXstring(s).c_str()
#ifdef _WIN32
// Declares the application entry point so that the application's main function
// always receives UTF-8 encoded arguments, on all platforms.
//
// On Windows the real entry point is wmain, which receives UTF-16 (wchar_t)
// arguments. The macro expands to a wmain that converts each argument to UTF-8
// with ceda::AsString8, keeps the converted strings alive in 'strings_u8',
// builds a null-terminated argv-style array of pointers into those strings,
// and forwards to the user-supplied function body (compiled as 'main_u8').
//
// Usage:
//     CEDA_MAIN_FUNCTION(argc,argv)
//     {
//         // argv[i] is UTF-8 here on every platform
//     }
//
// NOTE: comments cannot be placed inside the macro body below because line
// splicing (backslash-newline) happens before comment removal, so a // comment
// would swallow the continuation.
#define CEDA_MAIN_FUNCTION(argc,argv) \
int main_u8( int argc, const char* argv[] ); \
int wmain( int argc, wchar_t *argv[] ) \
{ \
ceda::xvector<ceda::xstring> strings_u8(argc); \
ceda::xvector<const char*> argv_u8(argc+1); \
for (int i=0 ; i < argc ; ++i) \
{ \
strings_u8[i] = ceda::AsString8(argv[i]); \
argv_u8[i] = strings_u8[i].c_str(); \
} \
argv_u8[argc] = nullptr; \
return main_u8(argc, argv_u8.data()); \
} \
int main_u8( int argc, const char* argv[] )
#else
// On non-Windows platforms the arguments are passed through unchanged; they are
// in the locale's narrow encoding, which is assumed to be UTF-8 -- TODO confirm
// this assumption holds for all supported target platforms.
#define CEDA_MAIN_FUNCTION(argc,argv) \
int main(int argc, char* argv[])
#endif
/*
Proposal
--------
A problem with standard C++ is the convention that 'char' represents one of the signed
or unsigned integer types (it's platform dependent), and yet it's used in ANSI character
strings, and at least conceptually characters and numbers are very different things. This
is very clear when we want to write a character to an ostream and have it output a
character rather than a base 10 representation of an integer (say).
On the Intel platform using MSVC the following are all perfect aliases:
signed char
char
__int8
signed __int8
Therefore currently on this platform ceda::char8 and ceda::int8 are identical types.
This causes a number of problems:
* Platform dependent inconsistency with how the integer types are written to an
ostream. On our platform int8 is written as a character whereas all the others
(int16, int32, int64, uint8, uint16, uint32, uint64) are written out as an integer.
* Since int8 and char8 are identical you can't write functions overloaded on int8 and
char8.
* Variants cannot have int8 and char8 at the same time.
The problem occurs for char8 but not char16. Curiously wchar_t is an unsigned 16 bit
integer that's treated as a distinct type from unsigned short for the purpose of function
overloading, so ceda::char16 doesn't match any of the integer types.
A solution to this problem is to regard characters as something very distinct from
integers, and on that basis use classes for char8 and char16 with private members for
storing the underlying value using an integral data type. This overcomes all the problems
above.
Conversion functions must be explicitly used to convert between characters and integers.
// convert from char8 to int8
int8 ctoi(char8)
// convert from int8 to char8
char8 itoc(int8)
It is expected that most programs rarely need these conversions, and when they are required
it is a good idea to make them explicit (rather than silently allowing integer arithmetic
on characters which is often nonsensical).
This frees up binary + so it can be used consistently as an associative and commutative
concatenation operator defined on the union of all string and character values. The
concatenation operator would have the following signatures:
string = char + char
string = string + char
string = char + string
string = string + string
string += char
string += string
This works well if (char + char) doesn't have the interpretation of adding two integers
together.
xstring means UTF-8
-------------------
xstring is to be regarded as a platform independent type that means UTF-8 on all builds on all
platforms. Elements of xstrings are of type xchar, and xchar is always an alias for char8 (an 8-bit UTF-8 code unit).
xstring can be safely used in persistent models, rmi interfaces etc.
xstring literals are to be provided directly in double quotes without any macros to provide a
prefix.
char, wchar_t, TCHAR are all platform dependent types so must only be used where platform
dependent types are appropriate (such as when calling platform dependent APIs, such as Win32
API functions).
Warning : Microsoft mistakenly uses 'Unicode' and 'widechar' as synonyms for 'UCS-2' and 'UTF-16'.
UTF-8 cannot be set as the encoding for narrow strings in the Win32 API. Therefore to support unicode
one must compile with _UNICODE rather than _MBCS.
We shall drop support for ANSI builds, so UNICODE / _UNICODE are always defined (when targeting
MS Windows). TCHAR always means wchar_t. There is no reason to prefer using:
TCHAR, "hello world",
_tmain, _tsystem,
_tstoi, _tstof, _tcstod, _tcstol, _tcstoui64, _tcstoul,
_tfopen,
_tcsncpy, _tcscpy, _tcslen, _tcscmp, _tcsclen
_tgetenv_s, _tputenv_s
Note that TCHAR represents a platform dependent type that happens to correspond to wchar_t. It will
be clearer to use these instead, when making platform dependent Windows OS calls
wchar_t, L"hello world"
wmain, _wsystem,
_wtoi, _wtof, wcstod, wcstol, _wcstoui64, wcstoul,
_wfopen,
wcsncpy, wcscpy, wcslen, wcscmp, wcslen
_wgetenv_s, _wputenv_s
On Windows, wchar_t is 16 bits and used for encoding UTF-16 strings. Windows uses UTF-16 natively.
For example, in the following function 'lpName' is a pointer to a zero terminated UTF-16 encoded
string.
HANDLE CreateEventW(
LPSECURITY_ATTRIBUTES lpEventAttributes,
BOOL bManualReset,
BOOL bInitialState,
LPCWSTR lpName
);
Conversion from xstring to Windows UTF-16 strings
-------------------------------------------------
This means lossless UTF-8 to UTF-16, and can be achieved using
AsString16(s).c_str()
where s is a UTF-8 string
Some examples of how to write the code
--------------------------------------
1. Current code use _tfopen to open a file with a name recorded in an xstring.
We want to allow for the xstring to have non-ascii characters. In general, when
calling into a Win32 function we need to apply a conversion from UTF-8 to the Windows Unicode
encoding which is UTF-16. That allows us to call _wfopen
String headers
--------------
tchar.h
|
|
-- xchar.h
/ | \
/ | \
/ | \
| | BasicTypes.h wchar.h
| | | /
| | | /
| xostream.h VectorOfByte.h
| | /
| | /
| xvector.h
| /
\ /
\ /
xstring.h -----------------
/ \ \
/ \ StringStream.h
/ \ / \
BasicSubString.h StringExt.h \
| \ |
| \ |
| \ |
| SubString.h |
| |
BasicSubStringFn.h--------------------------+
*/
namespace ceda
{
#ifdef __cpp_char8_t
    // Type for UTF-8 character representation, required to be large enough to
    // represent any UTF-8 code unit (8 bits). It has the same size, signedness,
    // and alignment as unsigned char (and therefore the same size and alignment
    // as char and signed char), but is a distinct type.
    using char8 = char8_t;
#else
    // Pre-C++20 fallback: plain char, 8 bits on supported platforms.
    // Note: the signedness of plain char is implementation-defined.
    using char8 = char;
#endif

    // Type for UTF-16 character representation, required to be large enough to
    // represent any UTF-16 code unit (16 bits). It has the same size, signedness,
    // and alignment as std::uint_least16_t, but is a distinct type.
    using char16 = char16_t;

    // Element type of xstring: always an 8-bit UTF-8 code unit (see char8 above).
    using xchar = char8;

    // Self-document cases when the string is assumed to be terminated with '\0'.
    typedef const char8* ConstString8Z;   // Zero terminated UTF-8 string
    typedef const char16* ConstString16Z; // Zero terminated UTF-16 string
    typedef const xchar* ConstStringZ;    // Zero terminated UTF-8 string

    // The <cctype> classification functions require their int argument to be
    // representable as an unsigned char, or equal to EOF. When xchar is (or
    // aliases) a signed character type, a code unit above 127 would sign-extend
    // to a negative int, which is undefined behaviour and trips the assertion in
    // some CRTs. Therefore each wrapper below first casts to unsigned char.
    //
    // The std:: qualification is deliberate: <cctype> is only guaranteed to
    // declare these functions in namespace std; the unqualified global-namespace
    // names are an implementation detail and not portable.

    // True if c is a lowercase letter in the current C locale.
    inline bool IsLower(xchar c)
    {
        return std::islower(static_cast<unsigned char>(c)) != 0;
    }

    // True if c is an uppercase letter.
    inline bool IsUpper(xchar c)
    {
        return std::isupper(static_cast<unsigned char>(c)) != 0;
    }

    // True if c is an alphabetic character.
    inline bool IsAlpha(xchar c)
    {
        return std::isalpha(static_cast<unsigned char>(c)) != 0;
    }

    // True if c is a decimal digit.
    inline bool IsDigit(xchar c)
    {
        return std::isdigit(static_cast<unsigned char>(c)) != 0;
    }

    // True if c is alphanumeric.
    inline bool IsAlnum(xchar c)
    {
        return std::isalnum(static_cast<unsigned char>(c)) != 0;
    }

    // True if c is a printable character (including space).
    inline bool IsPrint(xchar c)
    {
        return std::isprint(static_cast<unsigned char>(c)) != 0;
    }

    /////////////////////////////////////

    // Lowercase conversion of a single code unit; identity for characters with
    // no lowercase form.
    inline xchar ToLower(xchar c)
    {
        return static_cast<xchar>(std::tolower(static_cast<unsigned char>(c)));
    }

    // Uppercase conversion of a single code unit; identity for characters with
    // no uppercase form.
    inline xchar ToUpper(xchar c)
    {
        return static_cast<xchar>(std::toupper(static_cast<unsigned char>(c)));
    }
} // namespace ceda
#endif // include guard