#include <iostream>
#include <fstream>
#include <algorithm>
#include <sstream>
#include <clocale>
#include <cwchar> // for mbstate_t
#include <cstddef> // for std::size_t
2.
In order to use wcout to print a wchar_t[] or a wstring, you must first call:
setlocale( LC_ALL, "en_US.UTF-8" );
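Once this is set and wcout has been used, you can no longer use cout: stdout becomes wide-oriented after the first wide output, and mixing narrow and wide output on the same stream is not reliable.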
3.
In order to read UTF-8 text with wifstream, you must first call imbue() on the stream:
wifstream fin("utf8.txt");
fin.imbue(std::locale("en_US.UTF-8"));
sample code that works correctly:
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <iostream>
#include <fstream>
#include <sstream>
#include <locale>
#include <cwchar> // for mbstate_t
#include <cstddef> // for std::size_t
using namespace std;
#define def(a,b) typeof(b) a=b
#define dec(a,b) typeof(b) a
// Demo program: prints two wide strings, then reads whitespace-separated
// words from the UTF-8 file "a.txt" and prints each word's size, length and
// its characters separated by spaces.
// Returns 0 on success, 1 if the locale cannot be set or a.txt cannot be opened.
int main(int argc, char *argv[]){
    (void)argc;
    (void)argv;

    // The C locale must be switched before the first wide output. setlocale
    // returns a null pointer on failure, in which case wide output of
    // non-ASCII text would not work, so report it on the narrow error stream
    // (no wide output has happened yet) and stop.
    if (!setlocale( LC_ALL, "en_US.UTF-8" )){
        cerr << "setlocale(LC_ALL, \"en_US.UTF-8\") failed" << endl;
        return 1;
    }

    std::locale utf8_locale("en_US.UTF-8");
    wchar_t ss[] = L"你是谁我?asd asdf d";
    wstring ws(L"机器,1234电脑");

    wifstream fin("a.txt");
    if (!fin){
        wcerr << L"cannot open a.txt" << endl;
        return 1;
    }
    // The stream needs a UTF-8 locale so that bytes are decoded into wide characters.
    fin.imbue(utf8_locale);

    wcout << ws << endl;
    wcout << ss << endl;

    wstring s;
    while(fin>>s){
        wcout << L"size=" << s.size() << endl;
        wcout << L"length=" << s.length() << endl;
        // size_type index: avoids comparing a signed int with the unsigned length().
        for(wstring::size_type x=0; x<s.length(); x++)
            wcout << s[x] << L" ";
        wcout << endl;
    }
    return 0;
}
a.txt contains 1 line:
我是 谁?a sd f
output:
机器,1234电脑
你是谁我?asd asdf d
size=2
length=2
我 是
size=3
length=3
谁 ? a
size=2
length=2
s d
size=1
length=1
f
Note that your console must support UTF-8 and its locale must be set to en_US.UTF-8.
To convert between a UTF-8 encoded string and a wstring, use the following (requires C++11; note that std::wstring_convert and <codecvt> are deprecated since C++17):
#include <codecvt>
#include <string>
// Decode a UTF-8 encoded byte string into a wide string.
// The work is delegated to std::wstring_convert with a
// std::codecvt_utf8<wchar_t> facet.
std::wstring utf8_string_to_wstring (const std::string& str)
{
    typedef std::wstring_convert<std::codecvt_utf8<wchar_t>> Utf8Converter;

    Utf8Converter converter;
    std::wstring wide = converter.from_bytes(str);
    return wide;
}
// Encode a wide string as a UTF-8 byte string.
// The work is delegated to std::wstring_convert with a
// std::codecvt_utf8<wchar_t> facet.
std::string utf8_wstring_to_string (const std::wstring& str)
{
    typedef std::wstring_convert<std::codecvt_utf8<wchar_t>> Utf8Converter;

    Utf8Converter converter;
    std::string bytes = converter.to_bytes(str);
    return bytes;
}
However, <codecvt> is not implemented by gcc/g++ versions older than GCC 5 (libstdc++ added it in GCC 5), so with such compilers you can use the following manual conversion instead:
// String aliases used by the conversion helpers below.
// NOTE(review): Str/WStr are not defined in the visible snippet; they are
// assumed to be std::string / std::wstring (the code only needs clear(),
// size(), operator[], push_back() and stream I/O) -- confirm against the
// original project. Re-declaring an identical typedef is harmless.
typedef std::string Str;
typedef std::wstring WStr;

// Returns the numeric value of one wide code unit as an unsigned number, so
// that a negative wchar_t (on platforms where wchar_t is signed) is never
// mistaken for a small ASCII value.
static unsigned long wideUnitValue(wchar_t w){
    unsigned long value = static_cast<unsigned long>(w);
    if (sizeof(wchar_t) == 2)
        value &= 0xffff; // undo sign extension of a 16-bit signed wchar_t
    return value;
}

// Appends one fully decoded code point to dest. Overlong encodings
// (cp < minCp), surrogates and values above U+10FFFF are replaced by err.
// With a 16-bit wchar_t, code points above U+FFFF are stored as a UTF-16
// surrogate pair.
static void appendDecodedCodePoint(WStr& dest, unsigned long cp, unsigned long minCp, wchar_t err){
    const bool overlong = cp < minCp;
    const bool surrogate = cp >= 0xd800 && cp <= 0xdfff;
    if (overlong || surrogate || cp > 0x10ffff){
        dest.push_back(err);
        return;
    }
    if (sizeof(wchar_t) == 2 && cp > 0xffff){
        const unsigned long offset = cp - 0x10000;
        dest.push_back(static_cast<wchar_t>(0xd800 + (offset >> 10)));
        dest.push_back(static_cast<wchar_t>(0xdc00 + (offset & 0x3ff)));
        return;
    }
    dest.push_back(static_cast<wchar_t>(cp));
}

// Decodes the UTF-8 bytes in src into dest (dest is cleared first).
// Every malformed piece of input produces one U+FFFD replacement character:
// a stray continuation byte, an invalid lead byte (0xF8..0xFF), a sequence
// cut short by another byte or by the end of input, an overlong encoding,
// an encoded surrogate, or a value above U+10FFFF.
void utf8toWStr(WStr& dest, const Str& src){
    // Written numerically so the result does not depend on the source file encoding.
    const wchar_t err = static_cast<wchar_t>(0xfffd);

    dest.clear();
    unsigned long cp = 0;     // code point being assembled
    unsigned long minCp = 0;  // smallest value legal for the current sequence length
    int bytes = 0;            // continuation bytes still expected

    for (std::size_t i = 0; i < src.size(); i++){
        const unsigned char c = static_cast<unsigned char>(src[i]);

        if (c >= 0x80 && c <= 0xbf){ // continuation byte
            if (bytes == 0){
                dest.push_back(err); // continuation without a lead byte
                continue;
            }
            cp = (cp << 6) | (c & 0x3f);
            if (--bytes == 0)
                appendDecodedCodePoint(dest, cp, minCp, err);
            continue;
        }

        // Any other byte ends a pending sequence prematurely; report the
        // truncated sequence instead of dropping it silently.
        if (bytes){
            dest.push_back(err);
            bytes = 0;
        }

        if (c <= 0x7f){ // single-byte (ASCII)
            dest.push_back(static_cast<wchar_t>(c));
        }
        else if (c <= 0xdf){ // 2-byte sequence start
            bytes = 1; cp = c & 0x1f; minCp = 0x80;
        }
        else if (c <= 0xef){ // 3-byte sequence start
            bytes = 2; cp = c & 0x0f; minCp = 0x800;
        }
        else if (c <= 0xf7){ // 4-byte sequence start
            bytes = 3; cp = c & 0x07; minCp = 0x10000;
        }
        else{ // 0xf8..0xff never appear in UTF-8
            dest.push_back(err);
        }
    }
    if (bytes)
        dest.push_back(err); // input ended in the middle of a sequence
}

// Encodes the wide string src as UTF-8 into dest (dest is cleared first).
// A high surrogate immediately followed by a low surrogate is combined into
// one code point (UTF-16 style wide strings). Unpaired surrogates and values
// above U+10FFFF (including negative wchar_t values) are written as '?'.
void wstrToUtf8(Str& dest, const WStr& src){
    dest.clear();
    for (std::size_t i = 0; i < src.size(); i++){
        unsigned long cp = wideUnitValue(src[i]);

        if (cp >= 0xd800 && cp <= 0xdbff && i + 1 < src.size()){
            const unsigned long low = wideUnitValue(src[i + 1]);
            if (low >= 0xdc00 && low <= 0xdfff){
                cp = 0x10000 + ((cp - 0xd800) << 10) + (low - 0xdc00);
                i++; // the low surrogate has been consumed
            }
        }

        if (cp <= 0x7f){
            dest.push_back(static_cast<char>(cp));
        }
        else if (cp <= 0x7ff){
            dest.push_back(static_cast<char>(0xc0 | ((cp >> 6) & 0x1f)));
            dest.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
        }
        else if (cp >= 0xd800 && cp <= 0xdfff){
            dest.push_back('?'); // unpaired surrogate has no UTF-8 form
        }
        else if (cp <= 0xffff){
            dest.push_back(static_cast<char>(0xe0 | ((cp >> 12) & 0x0f)));
            dest.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3f)));
            dest.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
        }
        else if (cp <= 0x10ffff){
            dest.push_back(static_cast<char>(0xf0 | ((cp >> 18) & 0x07)));
            dest.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3f)));
            dest.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3f)));
            dest.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
        }
        else{
            dest.push_back('?');
        }
    }
}

// Convenience overload: returns the UTF-8 encoding of str.
Str wstrToUtf8(const WStr& str){
    Str result;
    wstrToUtf8(result, str);
    return result;
}

// Convenience overload: returns the wide string decoded from the UTF-8 bytes in str.
WStr utf8toWStr(const Str& str){
    WStr result;
    utf8toWStr(result, str);
    return result;
}

// Writes s to the narrow stream f as UTF-8.
std::ostream& operator<<(std::ostream& f, const WStr& s){
    Str s1;
    wstrToUtf8(s1, s);
    f << s1;
    return f;
}

// Reads one whitespace-delimited token from the narrow stream f and stores
// its UTF-8 decoding in s.
std::istream& operator>>(std::istream& f, WStr& s){
    Str s1;
    f >> s1;
    utf8toWStr(s, s1);
    return f;
}