diff options
Diffstat (limited to 'src/common/string_util.cpp')
-rw-r--r-- | src/common/string_util.cpp | 190 |
1 files changed, 126 insertions, 64 deletions
diff --git a/src/common/string_util.cpp b/src/common/string_util.cpp index 61f0939c4..6d9612fb5 100644 --- a/src/common/string_util.cpp +++ b/src/common/string_util.cpp @@ -2,13 +2,14 @@ // Licensed under GPLv2 // Refer to the license.txt file included. -#include <algorithm> +#include <boost/range/algorithm.hpp> #include "common/common.h" #include "common/string_util.h" #ifdef _WIN32 #include <Windows.h> + #include <codecvt> #else #include <iconv.h> #endif @@ -17,20 +18,20 @@ namespace Common { /// Make a string lowercase std::string ToLower(std::string str) { - std::transform(str.begin(), str.end(), str.begin(), ::tolower); + boost::transform(str, str.begin(), ::tolower); return str; } /// Make a string uppercase std::string ToUpper(std::string str) { - std::transform(str.begin(), str.end(), str.begin(), ::toupper); + boost::transform(str, str.begin(), ::toupper); return str; } // faster than sscanf bool AsciiToHex(const char* _szValue, u32& result) { - char *endptr = NULL; + char *endptr = nullptr; const u32 value = strtoul(_szValue, &endptr, 16); if (!endptr || *endptr) @@ -68,7 +69,7 @@ bool CharArrayFromFormatV(char* out, int outsize, const char* format, va_list ar // will be present in the middle of a multibyte sequence. // // This is why we lookup an ANSI (cp1252) locale here and use _vsnprintf_l. - static locale_t c_locale = NULL; + static locale_t c_locale = nullptr; if (!c_locale) c_locale = _create_locale(LC_ALL, ".1252"); writtenCount = _vsnprintf_l(out, outsize, format, c_locale, args); @@ -91,7 +92,7 @@ bool CharArrayFromFormatV(char* out, int outsize, const char* format, va_list ar std::string StringFromFormat(const char* format, ...) { va_list args; - char *buf = NULL; + char *buf = nullptr; #ifdef _WIN32 int required = 0; @@ -106,7 +107,7 @@ std::string StringFromFormat(const char* format, ...) #else va_start(args, format); if (vasprintf(&buf, format, args) < 0) - ERROR_LOG(COMMON, "Unable to allocate memory for string"); + LOG_ERROR(Common, "Unable to allocate memory for string"); va_end(args); std::string temp = buf; @@ -120,11 +121,11 @@ std::string ArrayToString(const u8 *data, u32 size, int line_len, bool spaces) { std::ostringstream oss; oss << std::setfill('0') << std::hex; - + for (int line = 0; size; ++data, --size) { oss << std::setw(2) << (int)*data; - + if (line_len == ++line) { oss << '\n'; @@ -161,13 +162,13 @@ std::string StripQuotes(const std::string& s) bool TryParse(const std::string &str, u32 *const output) { - char *endptr = NULL; + char *endptr = nullptr; // Reset errno to a value other than ERANGE errno = 0; unsigned long value = strtoul(str.c_str(), &endptr, 0); - + if (!endptr || *endptr) return false; @@ -293,7 +294,7 @@ std::string ReplaceAll(std::string result, const std::string& src, const std::st //#include <string> //#include <assert.h> -const char HEX2DEC[256] = +const char HEX2DEC[256] = { /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */ /* 0 */ 16,16,16,16, 16,16,16,16, 16,16,16,16, 16,16,16,16, @@ -326,7 +327,7 @@ std::string UriDecode(const std::string & sSrc) const unsigned char * pSrc = (const unsigned char *)sSrc.c_str(); const size_t SRC_LEN = sSrc.length(); const unsigned char * const SRC_END = pSrc + SRC_LEN; - const unsigned char * const SRC_LAST_DEC = SRC_END - 2; // last decodable '%' + const unsigned char * const SRC_LAST_DEC = SRC_END - 2; // last decodable '%' char * const pStart = new char[SRC_LEN]; char * pEnd = pStart; @@ -393,7 +394,7 @@ std::string UriEncode(const std::string & sSrc) for (; pSrc < SRC_END; ++pSrc) { - if (SAFE[*pSrc]) + if (SAFE[*pSrc]) *pEnd++ = *pSrc; else { @@ -411,7 +412,19 @@ std::string UriEncode(const std::string & sSrc) #ifdef _WIN32 -std::string UTF16ToUTF8(const std::wstring& input) +std::string UTF16ToUTF8(const std::u16string& input) +{ + std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> convert; + return convert.to_bytes(input); +} + +std::u16string UTF8ToUTF16(const std::string& input) +{ + std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> convert; + return convert.from_bytes(input); +} + +static std::string UTF16ToUTF8(const std::wstring& input) { auto const size = WideCharToMultiByte(CP_UTF8, 0, input.data(), input.size(), nullptr, 0, nullptr, nullptr); @@ -424,7 +437,7 @@ std::string UTF16ToUTF8(const std::wstring& input) return output; } -std::wstring CPToUTF16(u32 code_page, const std::string& input) +static std::wstring CPToUTF16(u32 code_page, const std::string& input) { auto const size = MultiByteToWideChar(code_page, 0, input.data(), input.size(), nullptr, 0); @@ -437,7 +450,7 @@ std::wstring CPToUTF16(u32 code_page, const std::string& input) return output; } -std::wstring UTF8ToUTF16(const std::string& input) +std::wstring UTF8ToUTF16W(const std::string &input) { return CPToUTF16(CP_UTF8, input); } @@ -455,61 +468,123 @@ std::string CP1252ToUTF8(const std::string& input) #else template <typename T> -std::string CodeToUTF8(const char* fromcode, const std::basic_string<T>& input) +static std::string CodeToUTF8(const char* fromcode, const std::basic_string<T>& input) { std::string result; iconv_t const conv_desc = iconv_open("UTF-8", fromcode); - if ((iconv_t)-1 == conv_desc) + if ((iconv_t)(-1) == conv_desc) { - ERROR_LOG(COMMON, "Iconv initialization failure [%s]: %s", fromcode, strerror(errno)); + LOG_ERROR(Common, "Iconv initialization failure [%s]: %s", fromcode, strerror(errno)); + iconv_close(conv_desc); + return {}; } - else - { - size_t const in_bytes = sizeof(T) * input.size(); - size_t const out_buffer_size = 4 * in_bytes; - std::string out_buffer; - out_buffer.resize(out_buffer_size); + const size_t in_bytes = sizeof(T) * input.size(); + // Multiply by 4, which is the max number of bytes to encode a codepoint + const size_t out_buffer_size = 4 * in_bytes; - auto src_buffer = &input[0]; - size_t src_bytes = in_bytes; - auto dst_buffer = &out_buffer[0]; - size_t dst_bytes = out_buffer.size(); + std::string out_buffer; + out_buffer.resize(out_buffer_size); - while (src_bytes != 0) - { - size_t const iconv_result = iconv(conv_desc, (char**)(&src_buffer), &src_bytes, - &dst_buffer, &dst_bytes); + auto src_buffer = &input[0]; + size_t src_bytes = in_bytes; + auto dst_buffer = &out_buffer[0]; + size_t dst_bytes = out_buffer.size(); - if ((size_t)-1 == iconv_result) + while (0 != src_bytes) + { + size_t const iconv_result = iconv(conv_desc, (char**)(&src_buffer), &src_bytes, + &dst_buffer, &dst_bytes); + + if (static_cast<size_t>(-1) == iconv_result) + { + if (EILSEQ == errno || EINVAL == errno) { - if (EILSEQ == errno || EINVAL == errno) - { - // Try to skip the bad character - if (src_bytes != 0) - { - --src_bytes; - ++src_buffer; - } - } - else + // Try to skip the bad character + if (0 != src_bytes) { - ERROR_LOG(COMMON, "iconv failure [%s]: %s", fromcode, strerror(errno)); - break; + --src_bytes; + ++src_buffer; } } + else + { + LOG_ERROR(Common, "iconv failure [%s]: %s", fromcode, strerror(errno)); + break; + } } + } + + out_buffer.resize(out_buffer_size - dst_bytes); + out_buffer.swap(result); + + iconv_close(conv_desc); + + return result; +} + +std::u16string UTF8ToUTF16(const std::string& input) +{ + std::u16string result; - out_buffer.resize(out_buffer_size - dst_bytes); - out_buffer.swap(result); - + iconv_t const conv_desc = iconv_open("UTF-16LE", "UTF-8"); + if ((iconv_t)(-1) == conv_desc) + { + LOG_ERROR(Common, "Iconv initialization failure [UTF-8]: %s", strerror(errno)); iconv_close(conv_desc); + return {}; } - + + const size_t in_bytes = sizeof(char) * input.size(); + // Multiply by 4, which is the max number of bytes to encode a codepoint + const size_t out_buffer_size = 4 * sizeof(char16_t) * in_bytes; + + std::u16string out_buffer; + out_buffer.resize(out_buffer_size); + + char* src_buffer = const_cast<char*>(&input[0]); + size_t src_bytes = in_bytes; + char* dst_buffer = (char*)(&out_buffer[0]); + size_t dst_bytes = out_buffer.size(); + + while (0 != src_bytes) + { + size_t const iconv_result = iconv(conv_desc, &src_buffer, &src_bytes, + &dst_buffer, &dst_bytes); + + if (static_cast<size_t>(-1) == iconv_result) + { + if (EILSEQ == errno || EINVAL == errno) + { + // Try to skip the bad character + if (0 != src_bytes) + { + --src_bytes; + ++src_buffer; + } + } + else + { + LOG_ERROR(Common, "iconv failure [UTF-8]: %s", strerror(errno)); + break; + } + } + } + + out_buffer.resize(out_buffer_size - dst_bytes); + out_buffer.swap(result); + + iconv_close(conv_desc); + return result; } +std::string UTF16ToUTF8(const std::u16string& input) +{ + return CodeToUTF8("UTF-16LE", input); +} + std::string CP1252ToUTF8(const std::string& input) { //return CodeToUTF8("CP1252//TRANSLIT", input); @@ -523,19 +598,6 @@ std::string SHIFTJISToUTF8(const std::string& input) return CodeToUTF8("SJIS", input); } -std::string UTF16ToUTF8(const std::wstring& input) -{ - std::string result = - // CodeToUTF8("UCS-2", input); - // CodeToUTF8("UCS-2LE", input); - // CodeToUTF8("UTF-16", input); - CodeToUTF8("UTF-16LE", input); - - // TODO: why is this needed? - result.erase(std::remove(result.begin(), result.end(), 0x00), result.end()); - return result; -} - #endif } |