Yay they updated llstring, who's ready for a full rebuild?

This commit is contained in:
Lirusaito
2019-04-10 10:04:49 -04:00
parent 5e1a102de4
commit ed88e55e04
2 changed files with 453 additions and 12 deletions

View File

@@ -233,7 +233,7 @@ llutf16string wstring_to_utf16str(const LLWString &utf32str, S32 len)
{
out += cur_char;
}
i++;
++i;
}
return out;
}
@@ -493,7 +493,7 @@ std::string wstring_to_utf8str(const LLWString& utf32str, S32 len)
std::string out;
out.reserve(len);
for (S32 i = 0; i < len; i++)
for (S32 i = 0; i < len; ++i)
{
S32 n = wchar_to_utf8chars(utf32str[i], tchars);
tchars[n] = 0;
@@ -576,6 +576,78 @@ std::string utf8str_truncate(const std::string& utf8str, const S32 max_len)
}
}
// [RLVa:KB] - Checked: RLVa-2.1.0
std::string utf8str_substr(const std::string& utf8str, const S32 index, const S32 max_len)
{
    // Return up to max_len BYTES of utf8str starting at byte offset index,
    // never splitting a multi-byte UTF-8 sequence at the tail end.
    //
    // Guard degenerate arguments: an empty request, a negative length, or an
    // out-of-range start offset all yield an empty string. (Previously the
    // unsigned subtraction `utf8str.length() - index` underflowed when
    // index > length(), which then indexed past the end of the string; and a
    // negative max_len accidentally returned the whole suffix.)
    if ((max_len <= 0) || (index < 0) ||
        ((std::string::size_type)index >= utf8str.length()))
    {
        return std::string();
    }
    if (utf8str.length() - index <= (std::string::size_type)max_len)
    {
        // Everything from index onward fits: take it all.
        return utf8str.substr(index, max_len);
    }
    S32 cur_char = max_len;
    // If the byte at the cut point is ASCII (<= 0x7f), it cannot belong to a
    // multibyte sequence, so the cut is already safe.
    if ((U8)utf8str[index + cur_char] > 0x7f)
    {
        // A byte of the form 10xxxxxx is a UTF-8 continuation byte; back up
        // until we reach the sequence's lead byte.
        while (0x80 == (0xc0 & utf8str[index + cur_char]))
        {
            cur_char--;
            // Make sure we don't trash memory if we've got a bogus string
            // consisting entirely of continuation bytes.
            if (cur_char == 0)
            {
                break;
            }
        }
    }
    // The byte at cur_char begins a character that won't fit, so copy only
    // the cur_char bytes before it.
    return utf8str.substr(index, cur_char);
}
void utf8str_split(std::list<std::string>& split_list, const std::string& utf8str, size_t maxlen, char split_token)
{
    // Break utf8str into pieces of at most maxlen bytes, preferring to cut at
    // the last split_token before the limit and otherwise cutting on a UTF-8
    // character boundary. Pieces are appended to split_list (cleared first);
    // the split_token between two pieces is dropped.
    split_list.clear();

    // A maxlen of 0 could never make forward progress below (each piece would
    // come back empty, looping forever); treat it as "unsplittable" and hand
    // back the input as a single piece.
    if (0 == maxlen)
    {
        if (!utf8str.empty())
            split_list.push_back(utf8str);
        return;
    }

    std::string::size_type lenMsg = utf8str.length(), lenIt = 0;
    const char* pstrIt = utf8str.c_str(); std::string strTemp;
    while (lenIt < lenMsg)
    {
        if (lenIt + maxlen < lenMsg)
        {
            // Find the last split character before the length limit
            const char* pstrTemp = pstrIt + maxlen;
            while ( (pstrTemp > pstrIt) && (*pstrTemp != split_token) )
                pstrTemp--;
            if (pstrTemp > pstrIt)
                strTemp = utf8str.substr(lenIt, pstrTemp - pstrIt);
            else
                // No token in range: cut on a UTF-8 character boundary instead
                strTemp = utf8str_substr(utf8str, lenIt, maxlen);
        }
        else
        {
            // The remainder fits in one final piece
            strTemp = utf8str.substr(lenIt, std::string::npos);
        }
        // Defensive: if the cut produced nothing (malformed UTF-8 made up of
        // continuation bytes), force one byte of progress rather than hanging.
        if (strTemp.empty())
            strTemp = utf8str.substr(lenIt, 1);
        split_list.push_back(strTemp);
        lenIt += strTemp.length();
        pstrIt = utf8str.c_str() + lenIt;
        // Swallow the token we split on so it doesn't begin the next piece
        if (*pstrIt == split_token)
            lenIt++;
    }
}
// [/RLVa:KB]
std::string utf8str_symbol_truncate(const std::string& utf8str, const S32 symbol_len)
{
if (0 == symbol_len)
@@ -668,6 +740,12 @@ bool LLStringOps::isHexString(const std::string& str)
}
#if LL_WINDOWS
// Convenience overload: convert a wide string to a std::string using the
// UTF-8 code page.
std::string ll_convert_wide_to_string(const wchar_t* in)
{
return ll_convert_wide_to_string(in, CP_UTF8);
}
std::string ll_convert_wide_to_string(const wchar_t* in, unsigned int code_page)
{
std::string out;
@@ -705,6 +783,11 @@ std::string ll_convert_wide_to_string(const wchar_t* in, unsigned int code_page)
return out;
}
// Convenience overload: convert a std::string to a wide string using the
// UTF-8 code page. Per the header comment on the two-argument overload, the
// returned buffer is allocated with new[] and the caller must delete[] it.
wchar_t* ll_convert_string_to_wide(const std::string& in)
{
return ll_convert_string_to_wide(in, CP_UTF8);
}
wchar_t* ll_convert_string_to_wide(const std::string& in, unsigned int code_page)
{
// From review:
@@ -726,6 +809,67 @@ wchar_t* ll_convert_string_to_wide(const std::string& in, unsigned int code_page
w_out[real_output_str_len] = 0;
return w_out;
return {&w_out[0]};
}
// Decode one UTF-16 code point starting at inchars into a single UTF-32
// llwchar stored through outchar. Returns the number of UTF-16 code units
// consumed: 1 for a BMP character, 2 for a surrogate pair.
S32 wchartchars_to_llwchar(const std::wstring::value_type* inchars, llwchar* outchar)
{
const std::wstring::value_type* base = inchars;
std::wstring::value_type cur_char = *inchars++;
llwchar char32 = cur_char;
if ((cur_char >= 0xD800) && (cur_char <= 0xDFFF))
{
// Surrogates: combine the high/low pair into one supplementary code point.
// NOTE(review): this branch also fires for an unpaired LOW surrogate
// (0xDC00-0xDFFF) and unconditionally reads the next unit, so callers
// must supply well-formed UTF-16 -- confirm at call sites.
char32 = ((llwchar)(cur_char - 0xD800)) << 10;
cur_char = *inchars++;
char32 += (llwchar)(cur_char - 0xDC00) + 0x0010000UL;
}
else
{
char32 = (llwchar)cur_char;
}
*outchar = char32;
// Units consumed = how far the read pointer advanced from the start.
return inchars - base;
}
LLWString ll_convert_wide_to_wstring(const std::wstring& in)
{
    // Convert a Windows wide (UTF-16) string into our UTF-32 LLWString,
    // combining surrogate pairs into single code points.
    LLWString wout;
    // (The former `len <= 0` test was redundant: size() is unsigned, so it
    // could only mean "empty", which in.empty() already covers.)
    if (in.empty()) return wout;
    const size_t len = in.size();
    // std::wstring::c_str() is guaranteed contiguous and NUL-terminated, so
    // the decoder may safely peek one unit ahead at the very end.
    const std::wstring::value_type* chars16 = in.c_str();
    size_t i = 0;
    while (i < len)
    {
        llwchar cur_char;
        // Advance by however many UTF-16 units the decoder consumed (1 or 2)
        i += wchartchars_to_llwchar(chars16 + i, &cur_char);
        wout += cur_char;
    }
    return wout;
}
std::wstring ll_convert_wstring_to_wide(const LLWString& in)
{
std::wstring out;
size_t i = 0;
while (i < in.size())
{
U32 cur_char = in[i];
if (cur_char > 0xFFFF)
{
out += (0xD7C0 + (cur_char >> 10));
out += (0xDC00 | (cur_char & 0x3FF));
}
else
{
out += cur_char;
}
i++;
}
return out;
}
std::string ll_convert_string_to_utf8_string(const std::string& in)
@@ -736,7 +880,108 @@ std::string ll_convert_string_to_utf8_string(const std::string& in)
return out_utf8;
}
#endif // LL_WINDOWS
namespace
{
// Deleter for buffers that FormatMessageW() allocated on the process heap
// (used with FORMAT_MESSAGE_ALLOCATE_BUFFER below).
void HeapFree_deleter(void* ptr)
{
// instead of LocalFree(), per https://stackoverflow.com/a/31541205
HeapFree(GetProcessHeap(), NULL, ptr);
}
} // anonymous namespace
// Render the Windows system message text for 'error' (a GetLastError() code)
// as a std::wstring. If FormatMessageW() itself fails, synthesize a fallback
// string naming both error codes instead.
template<>
std::wstring windows_message<std::wstring>(DWORD error)
{
// derived from https://stackoverflow.com/a/455533
wchar_t* rawptr = nullptr;
auto okay = FormatMessageW(
// use system message tables for GetLastError() codes
FORMAT_MESSAGE_FROM_SYSTEM |
// internally allocate buffer and return its pointer
FORMAT_MESSAGE_ALLOCATE_BUFFER |
// you cannot pass insertion parameters (thanks Gandalf)
FORMAT_MESSAGE_IGNORE_INSERTS |
// ignore line breaks in message definition text
FORMAT_MESSAGE_MAX_WIDTH_MASK,
NULL, // lpSource, unused with FORMAT_MESSAGE_FROM_SYSTEM
error, // dwMessageId
MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), // dwLanguageId
(LPWSTR)&rawptr, // lpBuffer: force-cast wchar_t** to wchar_t*
0, // nSize, unused with FORMAT_MESSAGE_ALLOCATE_BUFFER
NULL); // Arguments, unused
// make a unique_ptr from rawptr so it gets cleaned up properly
// (freed via HeapFree_deleter whichever way we return)
std::unique_ptr<wchar_t, void(*)(void*)> bufferptr(rawptr, HeapFree_deleter);
if (okay && bufferptr)
{
// got the message, return it ('okay' is length in characters)
return { bufferptr.get(), okay };
}
// did not get the message, synthesize one
auto format_message_error = GetLastError();
std::wostringstream out;
out << L"GetLastError() " << error << L" (FormatMessageW() failed with "
<< format_message_error << L")";
return out.str();
}
boost::optional<std::wstring> llstring_getoptenv(const std::string& key)
{
    // Look up environment variable 'key' (UTF-8) via the wide-char Windows
    // API. Returns an empty boost::optional when the variable isn't set.
    //
    // ll_convert_string_to_wide() returns a buffer allocated with new[]
    // (see its header comment); the previous code never released it, leaking
    // on every call. Copy it into a std::wstring and free it right away.
    wchar_t* rawkey = ll_convert_string_to_wide(key);
    std::wstring wkey(rawkey);
    delete [] rawkey;

    // Take a wild guess as to how big the buffer should be.
    std::vector<wchar_t> buffer(1024);
    auto n = GetEnvironmentVariableW(wkey.c_str(), &buffer[0], (DWORD)buffer.size());
    // If our initial guess was too short, n will indicate the size (in
    // wchar_t's) that buffer should have been, including the terminating nul.
    if (n > (buffer.size() - 1))
    {
        // make it big enough
        buffer.resize(n);
        // and try again
        n = GetEnvironmentVariableW(wkey.c_str(), &buffer[0], (DWORD)buffer.size());
    }
    // did that (ultimately) succeed?
    if (n)
    {
        // great, return populated boost::optional
        return boost::optional<std::wstring>(&buffer[0]);
    }
    // not successful
    auto last_error = GetLastError();
    // Don't bother warning for NOT_FOUND; that's an expected case
    if (last_error != ERROR_ENVVAR_NOT_FOUND)
    {
        LL_WARNS() << "GetEnvironmentVariableW('" << key << "') failed: "
            << windows_message<std::string>(last_error) << LL_ENDL;
    }
    // return empty boost::optional
    return {};
}
#else // ! LL_WINDOWS
boost::optional<std::string> llstring_getoptenv(const std::string& key)
{
    // POSIX flavor: wrap getenv(), mapping "not set" to an empty optional.
    const char* value = getenv(key.c_str());
    if (!value)
    {
        // variable absent: empty boost::optional
        return {};
    }
    // variable present: populated boost::optional
    return boost::optional<std::string>(value);
}
#endif // ! LL_WINDOWS
// Static time-offset members of LLStringOps, zero until set elsewhere.
// NOTE(review): units (presumably seconds) are not established here -- see
// the setters/users of these members.
long LLStringOps::sPacificTimeOffset = 0;
long LLStringOps::sLocalTimeOffset = 0;

View File

@@ -27,6 +27,9 @@
#ifndef LL_LLSTRING_H
#define LL_LLSTRING_H
#include "llwin32headerslean.h"
#include <boost/optional/optional.hpp>
#include <string>
#include <cstdio>
//#include <locale>
@@ -34,8 +37,12 @@
#include <algorithm>
#include <vector>
#include <map>
#include "llsd.h"
#include "llfasttimer.h"
#include "llformat.h"
#include "llsd.h"
// [RLVa:KB] - Checked: RLVa-2.1.0
#include <list>
// [/RLVa:KB]
#if LL_LINUX || LL_SOLARIS
#include <wctype.h>
@@ -305,7 +312,7 @@ public:
static bool isValidIndex(const string_type& string, size_type i)
{
return !string.empty() && (i <= string.size());
return !string.empty() && (0 <= i) && (i <= string.size());
}
static bool contains(const string_type& string, T c, size_type i=0)
@@ -343,6 +350,19 @@ public:
const string_type& string,
const string_type& substr);
/**
* get environment string value with proper Unicode handling
* (key is always UTF-8)
* detect absence by return value == dflt
*/
static string_type getenv(const std::string& key, const string_type& dflt="");
/**
* get optional environment string value with proper Unicode handling
* (key is always UTF-8)
* detect absence by (! return value)
*/
static boost::optional<string_type> getoptenv(const std::string& key);
static void addCRLF(string_type& string);
static void removeCRLF(string_type& string);
static void removeWindowsCR(string_type& string);
@@ -503,6 +523,37 @@ LL_COMMON_API bool iswindividual(llwchar elem);
* Unicode support
*/
/// generic conversion aliases
// Customization point behind ll_convert<TO>(from): specialize this struct
// for each supported (TO, FROM) pair. The unused Enable parameter leaves
// room for enable_if-style partial specializations.
template<typename TO, typename FROM, typename Enable=void>
struct ll_convert_impl
{
// Don't even provide a generic implementation. We specialize for every
// combination we do support.
TO operator()(const FROM& in) const;
};
// Use a function template to get the nice ll_convert<TO>(from_value) API.
template<typename TO, typename FROM>
TO ll_convert(const FROM& in)
{
return ll_convert_impl<TO, FROM>()(in);
}
// degenerate case: converting a type to itself is the identity
template<typename T>
struct ll_convert_impl<T, T>
{
T operator()(const T& in) const { return in; }
};
// specialize ll_convert_impl<TO, FROM> to return EXPR
// (invoked at namespace scope; the invocation supplies the trailing
// semicolon, which is why the macro body deliberately omits one)
#define ll_convert_alias(TO, FROM, EXPR) \
template<> \
struct ll_convert_impl<TO, FROM> \
{ \
TO operator()(const FROM& in) const { return EXPR; } \
}
// Make the incoming string a utf8 string. Replaces any unknown glyph
// with the UNKNOWN_CHARACTER. Once any unknown glyph is found, the rest
// of the data may not be recovered.
@@ -510,37 +561,91 @@ LL_COMMON_API std::string rawstr_to_utf8(const std::string& raw);
//
// We should never use UTF16 except when communicating with Win32!
//
// https://docs.microsoft.com/en-us/cpp/cpp/char-wchar-t-char16-t-char32-t
// nat 2018-12-14: I consider the whole llutf16string thing a mistake, because
// the Windows APIs we want to call are all defined in terms of wchar_t*
// (or worse, LPCTSTR).
// https://docs.microsoft.com/en-us/windows/desktop/winprog/windows-data-types
// While there is no point coding for an ASCII-only world (! defined(UNICODE)),
// use of U16 and llutf16string for Windows APIs locks in /Zc:wchar_t-. Going
// forward, we should code in terms of wchar_t and std::wstring so as to
// support either setting of /Zc:wchar_t.
// The first link above states that char can be used to hold ASCII or any
// multi-byte character set, and distinguishes wchar_t (UTF-16LE), char16_t
// (UTF-16) and char32_t (UTF-32). Nonetheless, within this code base:
// * char and std::string always hold UTF-8 (of which ASCII is a subset). It
// is a BUG if they are used to pass strings in any other multi-byte
// encoding.
// * wchar_t and std::wstring should be our interface to Windows wide-string
// APIs, and therefore hold UTF-16LE.
// * U16 and llutf16string are the previous but DEPRECATED UTF-16LE type. Do
// not introduce new uses of U16 or llutf16string for string data.
// * llwchar and LLWString hold UTF-32 strings.
// * Do not introduce char16_t or std::u16string.
// * Do not introduce char32_t or std::u32string.
//
// llutf16string holds UTF-16LE code units. On Windows with native wchar_t
// (/Zc:wchar_t, signaled by _NATIVE_WCHAR_T_DEFINED) base it on wchar_t;
// otherwise fall back to the U16 typedef.
#if _WIN32 && _NATIVE_WCHAR_T_DEFINED
typedef wchar_t utf16strtype;
#else
typedef U16 utf16strtype;
#endif
typedef std::basic_string<utf16strtype> llutf16string;
#if ! defined(LL_WCHAR_T_NATIVE)
// wchar_t is identical to U16, and std::wstring is identical to llutf16string.
// Defining an ll_convert alias involving llutf16string would collide with the
// comparable preferred alias involving std::wstring. (In this scenario, if
// you pass llutf16string, it will engage the std::wstring specialization.)
#define ll_convert_u16_alias(TO, FROM, EXPR) // nothing
#else // defined(LL_WCHAR_T_NATIVE)
// wchar_t is a distinct native type, so llutf16string is also a distinct
// type, and there IS a point to converting separately to/from llutf16string.
// (But why? Windows APIs are still defined in terms of wchar_t, and
// in this scenario llutf16string won't work for them!)
#define ll_convert_u16_alias(TO, FROM, EXPR) ll_convert_alias(TO, FROM, EXPR)
#if LL_WINDOWS
// LL_WCHAR_T_NATIVE is defined on non-Windows systems because, in fact,
// wchar_t is native. Everywhere but Windows, we use it for llwchar (see
// stdtypes.h). That makes LLWString identical to std::wstring, so these
// aliases for std::wstring would collide with those for LLWString. Only
// define on Windows, where converting between std::wstring and llutf16string
// means copying chars.
ll_convert_alias(llutf16string, std::wstring, llutf16string(in.begin(), in.end()));
ll_convert_alias(std::wstring, llutf16string, std::wstring(in.begin(), in.end()));
#endif // LL_WINDOWS
#endif // defined(LL_WCHAR_T_NATIVE)
LL_COMMON_API LLWString utf16str_to_wstring(const llutf16string &utf16str, S32 len);
LL_COMMON_API LLWString utf16str_to_wstring(const llutf16string &utf16str);
ll_convert_u16_alias(LLWString, llutf16string, utf16str_to_wstring(in));
LL_COMMON_API llutf16string wstring_to_utf16str(const LLWString &utf32str, S32 len);
LL_COMMON_API llutf16string wstring_to_utf16str(const LLWString &utf32str);
ll_convert_u16_alias(llutf16string, LLWString, wstring_to_utf16str(in));
LL_COMMON_API llutf16string utf8str_to_utf16str ( const std::string& utf8str, S32 len);
LL_COMMON_API llutf16string utf8str_to_utf16str ( const std::string& utf8str );
ll_convert_u16_alias(llutf16string, std::string, utf8str_to_utf16str(in));
LL_COMMON_API LLWString utf8str_to_wstring(const std::string &utf8str, S32 len);
LL_COMMON_API LLWString utf8str_to_wstring(const std::string &utf8str);
// Same function, better name. JC
inline LLWString utf8string_to_wstring(const std::string& utf8_string) { return utf8str_to_wstring(utf8_string); }
// best name of all
ll_convert_alias(LLWString, std::string, utf8string_to_wstring(in));
//
LL_COMMON_API S32 wchar_to_utf8chars(llwchar inchar, char* outchars);
LL_COMMON_API std::string wstring_to_utf8str(const LLWString &utf32str, S32 len);
LL_COMMON_API std::string wstring_to_utf8str(const LLWString &utf32str);
ll_convert_alias(std::string, LLWString, wstring_to_utf8str(in));
LL_COMMON_API std::string utf16str_to_utf8str(const llutf16string &utf16str, S32 len);
LL_COMMON_API std::string utf16str_to_utf8str(const llutf16string &utf16str);
ll_convert_u16_alias(std::string, llutf16string, utf16str_to_utf8str(in));
#if LL_WINDOWS
inline std::string wstring_to_utf8str(const llutf16string &utf16str) { return utf16str_to_utf8str(utf16str);}
@@ -575,6 +680,11 @@ LL_COMMON_API S32 wstring_wstring_length_from_utf16_length(const LLWString & wst
*/
LL_COMMON_API std::string utf8str_truncate(const std::string& utf8str, const S32 max_len);
// [RLVa:KB] - Checked: RLVa-2.1.0
LL_COMMON_API std::string utf8str_substr(const std::string& utf8str, const S32 index, const S32 max_len);
LL_COMMON_API void utf8str_split(std::list<std::string>& split_list, const std::string& utf8str, size_t maxlen, char split_token);
// [/RLVa:KB]
LL_COMMON_API std::string utf8str_trim(const std::string& utf8str);
LL_COMMON_API S32 utf8str_compare_insensitive(
@@ -623,22 +733,77 @@ LL_COMMON_API std::string utf8str_removeCRLF(const std::string& utf8str);
* This replaces the unsafe W2A macro from ATL.
*/
LL_COMMON_API std::string ll_convert_wide_to_string(const wchar_t* in, unsigned int code_page);
LL_COMMON_API std::string ll_convert_wide_to_string(const wchar_t* in); // default CP_UTF8
// Convenience overload: accept std::wstring with an explicit code page.
inline std::string ll_convert_wide_to_string(const std::wstring& in, unsigned int code_page)
{
return ll_convert_wide_to_string(in.c_str(), code_page);
}
// Convenience overload: accept std::wstring with the default code page (CP_UTF8).
inline std::string ll_convert_wide_to_string(const std::wstring& in)
{
return ll_convert_wide_to_string(in.c_str());
}
ll_convert_alias(std::string, std::wstring, ll_convert_wide_to_string(in));
/**
* Converts a string to wide string.
*
* It will allocate memory for result string with "new []". Don't forget to release it with "delete []".
*/
LL_COMMON_API wchar_t* ll_convert_string_to_wide(const std::string& in, unsigned int code_page);
LL_COMMON_API wchar_t* ll_convert_string_to_wide(const std::string& in,
unsigned int code_page);
LL_COMMON_API wchar_t* ll_convert_string_to_wide(const std::string& in);
// default CP_UTF8
ll_convert_alias(wchar_t*, std::string, ll_convert_string_to_wide(in));
/**
* Converts incoming string into urf8 string
* Convert a Windows wide string to our LLWString
*/
LL_COMMON_API LLWString ll_convert_wide_to_wstring(const std::wstring& in);
ll_convert_alias(LLWString, std::wstring, ll_convert_wide_to_wstring(in));
/**
* Convert LLWString to Windows wide string
*/
LL_COMMON_API std::wstring ll_convert_wstring_to_wide(const LLWString& in);
ll_convert_alias(std::wstring, LLWString, ll_convert_wstring_to_wide(in));
/**
* Converts incoming string into utf8 string
*
*/
LL_COMMON_API std::string ll_convert_string_to_utf8_string(const std::string& in);
/// Get Windows message string for passed GetLastError() code
// VS 2013 doesn't let us forward-declare this template, which is what we
// started with, so the implementation could reference the specialization we
// haven't yet declared. Somewhat weirdly, just stating the generic
// implementation in terms of the specialization works, even in this order...
// the general case is just a conversion from the sole implementation
// Microsoft says DWORD is a typedef for unsigned long
// https://docs.microsoft.com/en-us/windows/desktop/winprog/windows-data-types
// so rather than drag windows.h into everybody's include space...
// Generic case: obtain the message from the sole std::wstring implementation
// and convert it to the requested string type via ll_convert.
template<typename STRING>
STRING windows_message(unsigned long error)
{
return ll_convert<STRING>(windows_message<std::wstring>(error));
}
/// There's only one real implementation
template<>
LL_COMMON_API std::wstring windows_message<std::wstring>(unsigned long error);
/// Get Windows message string, implicitly calling GetLastError()
template<typename STRING>
STRING windows_message() { return windows_message<STRING>(GetLastError()); }
//@}
#endif // LL_WINDOWS
LL_COMMON_API boost::optional<std::wstring> llstring_getoptenv(const std::string& key);
#else // ! LL_WINDOWS
LL_COMMON_API boost::optional<std::string> llstring_getoptenv(const std::string& key);
#endif // ! LL_WINDOWS
/**
* Many of the 'strip' and 'replace' methods of LLStringUtilBase need
@@ -1612,6 +1777,37 @@ bool LLStringUtilBase<T>::endsWith(
return (idx == (string.size() - substr.size()));
}
// static
template<class T>
auto LLStringUtilBase<T>::getoptenv(const std::string& key) -> boost::optional<string_type>
{
    // Fetch the platform-appropriate optional value, then convert it (when
    // present) to this specialization's string_type.
    if (auto found = llstring_getoptenv(key))
    {
        // populated boost::optional
        return { ll_convert<string_type>(*found) };
    }
    // variable not set: empty boost::optional
    return {};
}
// static
template<class T>
auto LLStringUtilBase<T>::getenv(const std::string& key, const string_type& dflt) -> string_type
{
    // Like getoptenv(), but collapse "not set" to the supplied default.
    auto found(getoptenv(key));
    return found ? *found : dflt;
}
template<class T>
BOOL LLStringUtilBase<T>::convertToBOOL(const string_type& string, BOOL& value)