diff --git a/indra/llcommon/llstring.cpp b/indra/llcommon/llstring.cpp
index 7203b4d6e..4e8649503 100644
--- a/indra/llcommon/llstring.cpp
+++ b/indra/llcommon/llstring.cpp
@@ -233,7 +233,7 @@ llutf16string wstring_to_utf16str(const LLWString &utf32str, S32 len)
 		{
 			out += cur_char;
 		}
-		i++;
+		++i;
 	}
 	return out;
 }
@@ -493,7 +493,7 @@ std::string wstring_to_utf8str(const LLWString& utf32str, S32 len)
 	std::string out;
 	out.reserve(len);
-	for (S32 i = 0; i < len; i++)
+	for (S32 i = 0; i < len; ++i)
 	{
 		S32 n = wchar_to_utf8chars(utf32str[i], tchars);
 		tchars[n] = 0;
@@ -576,6 +576,78 @@ std::string utf8str_truncate(const std::string& utf8str, const S32 max_len)
 	}
 }
 
+// [RLVa:KB] - Checked: RLVa-2.1.0
+std::string utf8str_substr(const std::string& utf8str, const S32 index, const S32 max_len)
+{
+	if (0 == max_len)
+	{
+		return std::string();
+	}
+	if (utf8str.length() - index <= max_len)
+	{
+		return utf8str.substr(index, max_len);
+	}
+	else
+	{
+		S32 cur_char = max_len;
+
+		// If we're ASCII, we don't need to do anything
+		if ((U8)utf8str[index + cur_char] > 0x7f)
+		{
+			// If the first two bits are (10), it's the tail end of a multibyte
+			// char. We need to shift back to the lead byte of the character.
+			while (0x80 == (0xc0 & utf8str[index + cur_char]))
+			{
+				cur_char--;
+				// Keep moving back until we hit the lead byte;
+				if (cur_char == 0)
+				{
+					// Make sure we don't trash memory if we've got a bogus string.
+					break;
+				}
+			}
+		}
+		// The byte index we're on is one we want to get rid of, so we only want
+		// to copy up to (cur_char-1) chars.
+		return utf8str.substr(index, cur_char);
+	}
+}
+
+void utf8str_split(std::list<std::string>& split_list, const std::string& utf8str, size_t maxlen, char split_token)
+{
+	split_list.clear();
+
+	std::string::size_type lenMsg = utf8str.length(), lenIt = 0;
+
+	const char* pstrIt = utf8str.c_str();
+	std::string strTemp;
+	while (lenIt < lenMsg)
+	{
+		if (lenIt + maxlen < lenMsg)
+		{
+			// Find the last split character within the next maxlen bytes
+			const char* pstrTemp = pstrIt + maxlen;
+			while ( (pstrTemp > pstrIt) && (*pstrTemp != split_token) )
+				pstrTemp--;
+
+			if (pstrTemp > pstrIt)
+				strTemp = utf8str.substr(lenIt, pstrTemp - pstrIt);
+			else
+				strTemp = utf8str_substr(utf8str, lenIt, maxlen);
+		}
+		else
+		{
+			strTemp = utf8str.substr(lenIt, std::string::npos);
+		}
+
+		split_list.push_back(strTemp);
+
+		lenIt += strTemp.length();
+		pstrIt = utf8str.c_str() + lenIt;
+		if (*pstrIt == split_token)
+			lenIt++;
+	}
+}
+// [/RLVa:KB]
+
 std::string utf8str_symbol_truncate(const std::string& utf8str, const S32 symbol_len)
 {
 	if (0 == symbol_len)
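Note: utf8str_split() chops a UTF-8 string into chunks of at most maxlen bytes, preferring to break after the last split_token in range and falling back to utf8str_substr() so that a chunk never ends partway through a multibyte sequence. A minimal usage sketch; the message variable, chunk size, and sendChunk() consumer are illustrative, not part of this patch:

    // Split a long chat message into <= 254-byte chunks, breaking at spaces
    // where possible; no chunk ends inside a multibyte UTF-8 character.
    std::list<std::string> chunks;
    utf8str_split(chunks, strUtf8Message, 254, ' ');
    for (const std::string& strChunk : chunks)
    {
        sendChunk(strChunk); // hypothetical consumer
    }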
@@ -668,6 +740,12 @@ bool LLStringOps::isHexString(const std::string& str)
 }
 
 #if LL_WINDOWS
+
+std::string ll_convert_wide_to_string(const wchar_t* in)
+{
+	return ll_convert_wide_to_string(in, CP_UTF8);
+}
+
 std::string ll_convert_wide_to_string(const wchar_t* in, unsigned int code_page)
 {
 	std::string out;
@@ -705,6 +783,11 @@ std::string ll_convert_wide_to_string(const wchar_t* in, unsigned int code_page)
 	return out;
 }
 
+wchar_t* ll_convert_string_to_wide(const std::string& in)
+{
+	return ll_convert_string_to_wide(in, CP_UTF8);
+}
+
 wchar_t* ll_convert_string_to_wide(const std::string& in, unsigned int code_page)
 {
 	// From review:
@@ -726,6 +809,67 @@ wchar_t* ll_convert_string_to_wide(const std::string& in, unsigned int code_page)
 	w_out[real_output_str_len] = 0;
 
 	return w_out;
 }
 
+S32 wchartchars_to_llwchar(const std::wstring::value_type* inchars, llwchar* outchar)
+{
+	const std::wstring::value_type* base = inchars;
+	std::wstring::value_type cur_char = *inchars++;
+	llwchar char32 = cur_char;
+	if ((cur_char >= 0xD800) && (cur_char <= 0xDFFF))
+	{
+		// Surrogates
+		char32 = ((llwchar)(cur_char - 0xD800)) << 10;
+		cur_char = *inchars++;
+		char32 += (llwchar)(cur_char - 0xDC00) + 0x0010000UL;
+	}
+	else
+	{
+		char32 = (llwchar)cur_char;
+	}
+	*outchar = char32;
+	return inchars - base;
+}
+
+LLWString ll_convert_wide_to_wstring(const std::wstring& in)
+{
+	LLWString wout;
+	if (in.empty()) return wout;
+
+	auto len = in.size();
+	size_t i = 0;
+	// craziness to make gcc happy (llutf16string.c_str() is tweaked on linux):
+	const std::wstring::value_type* chars16 = &(*(in.begin()));
+	while (i < len)
+	{
+		llwchar cur_char;
+		i += wchartchars_to_llwchar(chars16 + i, &cur_char);
+		wout += cur_char;
+	}
+	return wout;
+}
+
+std::wstring ll_convert_wstring_to_wide(const LLWString& in)
+{
+	std::wstring out;
+
+	size_t i = 0;
+	while (i < in.size())
+	{
+		U32 cur_char = in[i];
+		if (cur_char > 0xFFFF)
+		{
+			out += (0xD7C0 + (cur_char >> 10));
+			out += (0xDC00 | (cur_char & 0x3FF));
+		}
+		else
+		{
+			out += cur_char;
+		}
+		i++;
+	}
+	return out;
+}
+
 std::string ll_convert_string_to_utf8_string(const std::string& in)
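Note: wchartchars_to_llwchar() is standard UTF-16 surrogate-pair decoding, and ll_convert_wstring_to_wide() is the matching encoder; the constant 0xD7C0 is just 0xD800 - (0x10000 >> 10), folding the -0x10000 offset into the high-surrogate term. A worked round trip for U+1F600, offered as a sanity check rather than part of the patch:

    // Encode cur_char = 0x1F600 (> 0xFFFF):
    //   high = 0xD7C0 + (0x1F600 >> 10)   = 0xD7C0 + 0x7D  = 0xD83D
    //   low  = 0xDC00 | (0x1F600 & 0x3FF) = 0xDC00 | 0x200 = 0xDE00
    // Decode the pair (0xD83D, 0xDE00):
    //   char32  = (0xD83D - 0xD800) << 10        = 0xF400
    //   char32 += (0xDE00 - 0xDC00) + 0x10000    = 0x1F600
    LLWString emoji(1, llwchar(0x1F600));
    std::wstring wide = ll_convert_wstring_to_wide(emoji); // two code units on Windows
    LLWString back = ll_convert_wide_to_wstring(wide);     // back == emoji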
@@ -736,7 +880,108 @@ std::string ll_convert_string_to_utf8_string(const std::string& in)
 	return out_utf8;
 }
 
-#endif // LL_WINDOWS
+
+namespace
+{
+
+void HeapFree_deleter(void* ptr)
+{
+	// instead of LocalFree(), per https://stackoverflow.com/a/31541205
+	HeapFree(GetProcessHeap(), NULL, ptr);
+}
+
+} // anonymous namespace
+
+template<>
+std::wstring windows_message<std::wstring>(DWORD error)
+{
+	// derived from https://stackoverflow.com/a/455533
+	wchar_t* rawptr = nullptr;
+	auto okay = FormatMessageW(
+		// use system message tables for GetLastError() codes
+		FORMAT_MESSAGE_FROM_SYSTEM |
+		// internally allocate buffer and return its pointer
+		FORMAT_MESSAGE_ALLOCATE_BUFFER |
+		// you cannot pass insertion parameters (thanks Gandalf)
+		FORMAT_MESSAGE_IGNORE_INSERTS |
+		// ignore line breaks in message definition text
+		FORMAT_MESSAGE_MAX_WIDTH_MASK,
+		NULL,               // lpSource, unused with FORMAT_MESSAGE_FROM_SYSTEM
+		error,              // dwMessageId
+		MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), // dwLanguageId
+		(LPWSTR)&rawptr,    // lpBuffer: force-cast wchar_t** to wchar_t*
+		0,                  // nSize, unused with FORMAT_MESSAGE_ALLOCATE_BUFFER
+		NULL);              // Arguments, unused
+
+	// make a unique_ptr from rawptr so it gets cleaned up properly
+	std::unique_ptr<wchar_t, void(*)(void*)> bufferptr(rawptr, HeapFree_deleter);
+
+	if (okay && bufferptr)
+	{
+		// got the message, return it ('okay' is length in characters)
+		return { bufferptr.get(), okay };
+	}
+
+	// did not get the message, synthesize one
+	auto format_message_error = GetLastError();
+	std::wostringstream out;
+	out << L"GetLastError() " << error << L" (FormatMessageW() failed with "
+		<< format_message_error << L")";
+	return out.str();
+}
+
+boost::optional<std::wstring> llstring_getoptenv(const std::string& key)
+{
+	// ll_convert_string_to_wide() allocates with new[]; take ownership so the
+	// buffer is released on every path out of this function.
+	std::unique_ptr<wchar_t[]> wkey(ll_convert_string_to_wide(key));
+	// Take a wild guess as to how big the buffer should be.
+	std::vector<wchar_t> buffer(1024);
+	auto n = GetEnvironmentVariableW(wkey.get(), &buffer[0], buffer.size());
+	// If our initial guess was too short, n will indicate the size (in
+	// wchar_t's) that buffer should have been, including the terminating nul.
+	if (n > (buffer.size() - 1))
+	{
+		// make it big enough
+		buffer.resize(n);
+		// and try again
+		n = GetEnvironmentVariableW(wkey.get(), &buffer[0], buffer.size());
+	}
+	// did that (ultimately) succeed?
+	if (n)
+	{
+		// great, return populated boost::optional
+		return boost::optional<std::wstring>(&buffer[0]);
+	}
+
+	// not successful
+	auto last_error = GetLastError();
+	// Don't bother warning for NOT_FOUND; that's an expected case
+	if (last_error != ERROR_ENVVAR_NOT_FOUND)
+	{
+		LL_WARNS() << "GetEnvironmentVariableW('" << key << "') failed: "
+				   << windows_message<std::string>(last_error) << LL_ENDL;
+	}
+	// return empty boost::optional
+	return {};
+}
+
+#else // ! LL_WINDOWS
+
+boost::optional<std::string> llstring_getoptenv(const std::string& key)
+{
+	auto found = getenv(key.c_str());
+	if (found)
+	{
+		// return populated boost::optional
+		return boost::optional<std::string>(found);
+	}
+	else
+	{
+		// return empty boost::optional
+		return {};
+	}
+}
+
+#endif // ! LL_WINDOWS
 
 long LLStringOps::sPacificTimeOffset = 0;
 long LLStringOps::sLocalTimeOffset = 0;
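Note: the Windows branch gains windows_message<STRING>() for human-readable GetLastError() text, plus llstring_getoptenv() as the platform backend for LLStringUtil::getoptenv() (declared in llstring.h below). A brief usage sketch, assuming Windows and an arbitrary failed API call:

    // Log system text for the most recent Windows error code:
    LL_WARNS() << "CreateFile failed: "
               << windows_message<std::string>(GetLastError()) << LL_ENDL;

The std::string flavor works because the generic windows_message<STRING>(error) template routes the one real std::wstring implementation's result through ll_convert<STRING>().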
diff --git a/indra/llcommon/llstring.h b/indra/llcommon/llstring.h
index 35407ca4e..e1eb06f2b 100644
--- a/indra/llcommon/llstring.h
+++ b/indra/llcommon/llstring.h
@@ -27,6 +27,9 @@
 #ifndef LL_LLSTRING_H
 #define LL_LLSTRING_H
 
+#include "llwin32headerslean.h"
+
+#include <boost/optional.hpp>
 #include <string>
 #include <cstdio>
 //#include <locale>
@@ -34,8 +37,12 @@
 #include <iomanip>
 #include <algorithm>
 #include <vector>
-#include "llsd.h"
 #include "llfasttimer.h"
+#include "llformat.h"
+#include "llsd.h"
+// [RLVa:KB] - Checked: RLVa-2.1.0
+#include <list>
+// [/RLVa:KB]
 
 #if LL_LINUX || LL_SOLARIS
 #include <wctype.h>
@@ -305,7 +312,7 @@ public:
 
 	static bool isValidIndex(const string_type& string, size_type i)
 	{
-		return !string.empty() && (i <= string.size());
+		return !string.empty() && (0 <= i) && (i <= string.size());
 	}
 
 	static bool contains(const string_type& string, T c, size_type i=0)
@@ -343,6 +350,19 @@
 		const string_type& string,
 		const string_type& substr);
 
+	/**
+	 * get environment string value with proper Unicode handling
+	 * (key is always UTF-8)
+	 * detect absence by return value == dflt
+	 */
+	static string_type getenv(const std::string& key, const string_type& dflt="");
+	/**
+	 * get optional environment string value with proper Unicode handling
+	 * (key is always UTF-8)
+	 * detect absence by (! return value)
+	 */
+	static boost::optional<string_type> getoptenv(const std::string& key);
+
 	static void addCRLF(string_type& string);
 	static void removeCRLF(string_type& string);
 	static void removeWindowsCR(string_type& string);
@@ -503,6 +523,37 @@ LL_COMMON_API bool iswindividual(llwchar elem);
  * Unicode support
  */
 
+/// generic conversion aliases
+template<typename TO, typename FROM>
+struct ll_convert_impl
+{
+	// Don't even provide a generic implementation. We specialize for every
+	// combination we do support.
+	TO operator()(const FROM& in) const;
+};
+
+// Use a function template to get the nice ll_convert<TO>(from_value) API.
+template<typename TO, typename FROM>
+TO ll_convert(const FROM& in)
+{
+	return ll_convert_impl<TO, FROM>()(in);
+}
+
+// degenerate case
+template<typename T>
+struct ll_convert_impl<T, T>
+{
+	T operator()(const T& in) const { return in; }
+};
+
+// specialize ll_convert_impl<TO, FROM> to return EXPR
+#define ll_convert_alias(TO, FROM, EXPR)                    \
+template<>                                                  \
+struct ll_convert_impl<TO, FROM>                            \
+{                                                           \
+	TO operator()(const FROM& in) const { return EXPR; }    \
+}
+
 // Make the incoming string a utf8 string. Replaces any unknown glyph
 // with the UNKNOWN_CHARACTER. Once any unknown glyph is found, the rest
 // of the data may not be recovered.
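Note: ll_convert<TO>(from) dispatches at compile time to a matching ll_convert_impl<TO, FROM> specialization, and each ll_convert_alias() use below stamps out one such specialization. For illustration (not itself part of the patch), the later line ll_convert_alias(std::string, std::wstring, ll_convert_wide_to_string(in)); expands to:

    template<>
    struct ll_convert_impl<std::string, std::wstring>
    {
        std::string operator()(const std::wstring& in) const
        {
            return ll_convert_wide_to_string(in);
        }
    }; // trailing ';' supplied by the macro invocation

One consequence of this design is that the TO and FROM arguments are pasted into a template-id, so neither may contain an unparenthesized comma.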
@@ -510,37 +561,91 @@ LL_COMMON_API std::string rawstr_to_utf8(const std::string& raw);
 
 //
 // We should never use UTF16 except when communicating with Win32!
-//
+// https://docs.microsoft.com/en-us/cpp/cpp/char-wchar-t-char16-t-char32-t
+//
+// nat 2018-12-14: I consider the whole llutf16string thing a mistake, because
+// the Windows APIs we want to call are all defined in terms of wchar_t*
+// (or worse, LPCTSTR).
+// https://docs.microsoft.com/en-us/windows/desktop/winprog/windows-data-types
+// While there is no point coding for an ASCII-only world (! defined(UNICODE)),
+// use of U16 and llutf16string for Windows APIs locks in /Zc:wchar_t-. Going
+// forward, we should code in terms of wchar_t and std::wstring so as to
+// support either setting of /Zc:wchar_t.
+
+// The first link above states that char can be used to hold ASCII or any
+// multi-byte character set, and distinguishes wchar_t (UTF-16LE), char16_t
+// (UTF-16) and char32_t (UTF-32). Nonetheless, within this code base:
+// * char and std::string always hold UTF-8 (of which ASCII is a subset). It
+//   is a BUG if they are used to pass strings in any other multi-byte
+//   encoding.
+// * wchar_t and std::wstring should be our interface to Windows wide-string
+//   APIs, and therefore hold UTF-16LE.
+// * U16 and llutf16string are the previous but DEPRECATED UTF-16LE type. Do
+//   not introduce new uses of U16 or llutf16string for string data.
+// * llwchar and LLWString hold UTF-32 strings.
+// * Do not introduce char16_t or std::u16string.
+// * Do not introduce char32_t or std::u32string.
+//
 #if _WIN32 && _NATIVE_WCHAR_T_DEFINED
 typedef wchar_t utf16strtype;
 #else
 typedef U16 utf16strtype;
 #endif
-
 typedef std::basic_string<utf16strtype> llutf16string;
 
+#if ! defined(LL_WCHAR_T_NATIVE)
+// wchar_t is identical to U16, and std::wstring is identical to llutf16string.
+// Defining an ll_convert alias involving llutf16string would collide with the
+// comparable preferred alias involving std::wstring. (In this scenario, if
+// you pass llutf16string, it will engage the std::wstring specialization.)
+#define ll_convert_u16_alias(TO, FROM, EXPR) // nothing
+#else  // defined(LL_WCHAR_T_NATIVE)
+// wchar_t is a distinct native type, so llutf16string is also a distinct
+// type, and there IS a point to converting separately to/from llutf16string.
+// (But why? Windows APIs are still defined in terms of wchar_t, and in this
+// scenario llutf16string won't work for them!)
+#define ll_convert_u16_alias(TO, FROM, EXPR) ll_convert_alias(TO, FROM, EXPR)
+
+#if LL_WINDOWS
+// LL_WCHAR_T_NATIVE is defined on non-Windows systems because, in fact,
+// wchar_t is native. Everywhere but Windows, we use it for llwchar (see
+// stdtypes.h). That makes LLWString identical to std::wstring, so these
+// aliases for std::wstring would collide with those for LLWString. Only
+// define on Windows, where converting between std::wstring and llutf16string
+// means copying chars.
+ll_convert_alias(llutf16string, std::wstring, llutf16string(in.begin(), in.end()));
+ll_convert_alias(std::wstring, llutf16string, std::wstring(in.begin(), in.end()));
+#endif // LL_WINDOWS
+#endif // defined(LL_WCHAR_T_NATIVE)
+
 LL_COMMON_API LLWString utf16str_to_wstring(const llutf16string &utf16str, S32 len);
 LL_COMMON_API LLWString utf16str_to_wstring(const llutf16string &utf16str);
+ll_convert_u16_alias(LLWString, llutf16string, utf16str_to_wstring(in));
 
 LL_COMMON_API llutf16string wstring_to_utf16str(const LLWString &utf32str, S32 len);
 LL_COMMON_API llutf16string wstring_to_utf16str(const LLWString &utf32str);
+ll_convert_u16_alias(llutf16string, LLWString, wstring_to_utf16str(in));
 
 LL_COMMON_API llutf16string utf8str_to_utf16str ( const std::string& utf8str, S32 len);
 LL_COMMON_API llutf16string utf8str_to_utf16str ( const std::string& utf8str );
+ll_convert_u16_alias(llutf16string, std::string, utf8str_to_utf16str(in));
 
 LL_COMMON_API LLWString utf8str_to_wstring(const std::string &utf8str, S32 len);
 LL_COMMON_API LLWString utf8str_to_wstring(const std::string &utf8str);
 // Same function, better name. JC
 inline LLWString utf8string_to_wstring(const std::string& utf8_string) { return utf8str_to_wstring(utf8_string); }
+// best name of all
+ll_convert_alias(LLWString, std::string, utf8string_to_wstring(in));
 
 //
 LL_COMMON_API S32 wchar_to_utf8chars(llwchar inchar, char* outchars);
 
 LL_COMMON_API std::string wstring_to_utf8str(const LLWString &utf32str, S32 len);
 LL_COMMON_API std::string wstring_to_utf8str(const LLWString &utf32str);
+ll_convert_alias(std::string, LLWString, wstring_to_utf8str(in));
 
 LL_COMMON_API std::string utf16str_to_utf8str(const llutf16string &utf16str, S32 len);
 LL_COMMON_API std::string utf16str_to_utf8str(const llutf16string &utf16str);
+ll_convert_u16_alias(std::string, llutf16string, utf16str_to_utf8str(in));
 
 #if LL_WINDOWS
 inline std::string wstring_to_utf8str(const llutf16string &utf16str) { return utf16str_to_utf8str(utf16str);}
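Note: with the aliases above registered, every supported conversion is spelled the same way. A brief round-trip sketch (the literal is written as explicit UTF-8 bytes so it is source-encoding-proof):

    std::string utf8 = "gr\xC3\xBC\xC3\x9F";           // "grüß" as UTF-8 bytes
    LLWString wide = ll_convert<LLWString>(utf8);      // via utf8string_to_wstring()
    std::string back = ll_convert<std::string>(wide);  // via wstring_to_utf8str()
    // back == utf8; ll_convert<std::string>(utf8) hits the degenerate no-op case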
@@ -575,6 +680,11 @@ LL_COMMON_API S32 wstring_wstring_length_from_utf16_length(const LLWString & wst
  */
 LL_COMMON_API std::string utf8str_truncate(const std::string& utf8str, const S32 max_len);
 
+// [RLVa:KB] - Checked: RLVa-2.1.0
+LL_COMMON_API std::string utf8str_substr(const std::string& utf8str, const S32 index, const S32 max_len);
+LL_COMMON_API void utf8str_split(std::list<std::string>& split_list, const std::string& utf8str, size_t maxlen, char split_token);
+// [/RLVa:KB]
+
 LL_COMMON_API std::string utf8str_trim(const std::string& utf8str);
 
 LL_COMMON_API S32 utf8str_compare_insensitive(
@@ -623,22 +733,77 @@ LL_COMMON_API std::string utf8str_removeCRLF(const std::string& utf8str);
  * This replaces the unsafe W2A macro from ATL.
  */
 LL_COMMON_API std::string ll_convert_wide_to_string(const wchar_t* in, unsigned int code_page);
+LL_COMMON_API std::string ll_convert_wide_to_string(const wchar_t* in); // default CP_UTF8
+inline std::string ll_convert_wide_to_string(const std::wstring& in, unsigned int code_page)
+{
+	return ll_convert_wide_to_string(in.c_str(), code_page);
+}
+inline std::string ll_convert_wide_to_string(const std::wstring& in)
+{
+	return ll_convert_wide_to_string(in.c_str());
+}
+ll_convert_alias(std::string, std::wstring, ll_convert_wide_to_string(in));
 
 /**
  * Converts a string to wide string.
  *
  * It will allocate memory for result string with "new []". Don't forget to release it with "delete []".
  */
-LL_COMMON_API wchar_t* ll_convert_string_to_wide(const std::string& in, unsigned int code_page);
+LL_COMMON_API wchar_t* ll_convert_string_to_wide(const std::string& in,
+                                                 unsigned int code_page);
+LL_COMMON_API wchar_t* ll_convert_string_to_wide(const std::string& in);
+                                                 // default CP_UTF8
+ll_convert_alias(wchar_t*, std::string, ll_convert_string_to_wide(in));
 
 /**
- * Converts incoming string into urf8 string
+ * Convert a Windows wide string to our LLWString
+ */
+LL_COMMON_API LLWString ll_convert_wide_to_wstring(const std::wstring& in);
+ll_convert_alias(LLWString, std::wstring, ll_convert_wide_to_wstring(in));
+
+/**
+ * Convert LLWString to Windows wide string
+ */
+LL_COMMON_API std::wstring ll_convert_wstring_to_wide(const LLWString& in);
+ll_convert_alias(std::wstring, LLWString, ll_convert_wstring_to_wide(in));
+
+/**
+ * Converts incoming string into utf8 string
  *
  */
 LL_COMMON_API std::string ll_convert_string_to_utf8_string(const std::string& in);
 
+/// Get Windows message string for passed GetLastError() code
+// VS 2013 doesn't let us forward-declare this template, which is what we
+// started with, so the implementation could reference the specialization we
+// haven't yet declared. Somewhat weirdly, just stating the generic
+// implementation in terms of the specialization works, even in this order...
+
+// the general case is just a conversion from the sole implementation
+// Microsoft says DWORD is a typedef for unsigned long
+// https://docs.microsoft.com/en-us/windows/desktop/winprog/windows-data-types
+// so rather than drag windows.h into everybody's include space...
+template<typename STRING>
+STRING windows_message(unsigned long error)
+{
+	return ll_convert<STRING>(windows_message<std::wstring>(error));
+}
+
+/// There's only one real implementation
+template<>
+LL_COMMON_API std::wstring windows_message<std::wstring>(unsigned long error);
+
+/// Get Windows message string, implicitly calling GetLastError()
+template<typename STRING>
+STRING windows_message() { return windows_message<STRING>(GetLastError()); }
+
 //@}
-#endif // LL_WINDOWS
+
+LL_COMMON_API boost::optional<std::wstring> llstring_getoptenv(const std::string& key);
+
+#else // ! LL_WINDOWS
+
+LL_COMMON_API boost::optional<std::string> llstring_getoptenv(const std::string& key);
+
+#endif // ! LL_WINDOWS
 
 /**
  * Many of the 'strip' and 'replace' methods of LLStringUtilBase need
@@ -1612,6 +1777,37 @@ bool LLStringUtilBase<T>::endsWith(
 	return (idx == (string.size() - substr.size()));
 }
 
+// static
+template<class T>
+auto LLStringUtilBase<T>::getoptenv(const std::string& key) -> boost::optional<string_type>
+{
+	auto found(llstring_getoptenv(key));
+	if (found)
+	{
+		// return populated boost::optional
+		return { ll_convert<string_type>(*found) };
+	}
+	else
+	{
+		// empty boost::optional
+		return {};
+	}
+}
+
+// static
+template<class T>
+auto LLStringUtilBase<T>::getenv(const std::string& key, const string_type& dflt) -> string_type
+{
+	auto found(getoptenv(key));
+	if (found)
+	{
+		return *found;
+	}
+	else
+	{
+		return dflt;
+	}
+}
+
 template<class T>
 BOOL LLStringUtilBase<T>::convertToBOOL(const string_type& string, BOOL& value)
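Note: LLStringUtilBase<T>::getenv() layers a default onto getoptenv(), which converts whatever llstring_getoptenv() returns (std::wstring on Windows, std::string elsewhere) to the requested string_type via ll_convert. A short usage sketch; the AUDIO_DRIVER name and "default" value are illustrative only:

    // Narrow flavor; LLStringUtil is the LLStringUtilBase<char> typedef.
    std::string driver = LLStringUtil::getenv("AUDIO_DRIVER", "default");

    // Optional flavor distinguishes "unset" from "set but empty".
    boost::optional<std::string> home = LLStringUtil::getoptenv("HOME");
    if (home)
    {
        LL_INFOS() << "HOME=" << *home << LL_ENDL;
    }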