Wang Xuancong's Home Page: C++ STL how to print unicode string and characters

1. essential headers

#include <iostream>
#include <fstream>
#include <algorithm>
#include <sstream>
#include <clocale>
#include <cwchar> // for mbstate_t
#include <cstddef> // for std::size_t

In order to use wcout to print a wchar_t[] or a wstring, you must first call:

// Select the en_US.UTF-8 locale for all categories; setlocale returns NULL if that locale is unavailable.
setlocale( LC_ALL, "en_US.UTF-8" );
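Once this is set and you have started writing to wcout, you should no longer use cout: mixing narrow and wide output on the same stream is not reliable.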

In order to read UTF-8 text with wifstream, you must first call imbue() on the stream:

// Open the input file as a wide-character stream.
wifstream fin("utf8.txt");

// Imbue the en_US.UTF-8 locale before reading, as required for wifstream input here.
fin.imbue(std::locale("en_US.UTF-8"));

sample code that works correctly:

#include <stdio.h>
#include <stdlib.h>

#include <clocale> // for setlocale
#include <cstddef> // for std::size_t
#include <cwchar> // for mbstate_t
#include <fstream>
#include <iostream>
#include <locale>
#include <sstream>
#include <stdexcept> // for std::runtime_error
#include <string>

using namespace std;

// def(a, b): declares a variable `a` whose type is that of expression `b` and
// initializes it with `b`. Uses `typeof`, which is a compiler extension rather
// than standard C++ (decltype/auto are the standard C++11 equivalents).
// Neither macro is used by the sample's main().
#define def(a,b) typeof(b) a=b

// dec(a, b): declares a variable `a` whose type is that of expression `b`,
// without an initializer.
#define dec(a,b) typeof(b) a

// Demo program: prints two wide strings to the console, then reads the file
// "a.txt" as UTF-8 and, for every whitespace-separated token, prints its
// size, its length and its characters separated by single spaces.
//
// Returns 0 on success, or EXIT_FAILURE when the en_US.UTF-8 locale is not
// available or "a.txt" cannot be opened. argc and argv are not used.
int main(int argc, char *argv[]){
    (void)argc;
    (void)argv;

    // wcout can only print wide strings once the UTF-8 locale is active, so
    // a failure here is fatal rather than silently producing no output.
    if (setlocale(LC_ALL, "en_US.UTF-8") == NULL) {
        std::cerr << "error: locale en_US.UTF-8 is not available" << std::endl;
        return EXIT_FAILURE;
    }

    wifstream fin("a.txt");
    if (!fin) {
        std::cerr << "error: cannot open a.txt" << std::endl;
        return EXIT_FAILURE;
    }

    // std::locale's constructor throws std::runtime_error for an unknown
    // locale name; report it instead of terminating on an uncaught exception.
    try {
        fin.imbue(std::locale("en_US.UTF-8"));
    } catch (const std::runtime_error& e) {
        std::cerr << "error: cannot create locale en_US.UTF-8: " << e.what() << std::endl;
        return EXIT_FAILURE;
    }

    const wchar_t ss[] = L"你是谁我？asd asdf d";
    const wstring ws(L"机器，1234电脑");

    wcout << ws << L'\n';
    wcout << ss << L'\n';

    wstring s;
    while (fin >> s) {
        wcout << L"size=" << s.size() << L'\n';
        wcout << L"length=" << s.length() << L'\n';
        // size_type index avoids the signed/unsigned comparison with length().
        for (wstring::size_type x = 0; x < s.length(); x++)
            wcout << s[x] << L" ";
        wcout << L'\n';
    }

    return 0;
}

a.txt contains 1 line:

我是 谁？a sd f

output:

机器，1234电脑

你是谁我？asd asdf d

size=2

length=2

我 是

size=3

length=3

谁 ？ a

size=2

length=2

s d

size=1

length=1

f

Note that your console must support UTF-8 and the console locale must be set to en_US.UTF-8.

To convert between UTF-8 encoded string and wstring, use the following:

#include <codecvt>
#include <string>

// Decodes the UTF-8 encoded bytes in `str` and returns them as a std::wstring.
// The conversion is delegated to std::wstring_convert (declared in <locale>)
// with the std::codecvt_utf8<wchar_t> facet; with a default-constructed
// converter, from_bytes() throws std::range_error on input it cannot convert.
std::wstring utf8_string_to_wstring (const std::string& str)
{
    typedef std::codecvt_utf8<wchar_t> Utf8Facet;
    std::wstring_convert<Utf8Facet> converter;
    std::wstring decoded = converter.from_bytes(str);
    return decoded;
}

// Encodes the wide string `str` as UTF-8 and returns the bytes as a std::string.
// The conversion is delegated to std::wstring_convert (declared in <locale>)
// with the std::codecvt_utf8<wchar_t> facet; with a default-constructed
// converter, to_bytes() throws std::range_error on input it cannot convert.
std::string utf8_wstring_to_string (const std::wstring& str)
{
    typedef std::codecvt_utf8<wchar_t> Utf8Facet;
    std::wstring_convert<Utf8Facet> converter;
    std::string encoded = converter.to_bytes(str);
    return encoded;
}

 
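However, at the time of writing <codecvt> was not yet implemented by gcc/g++ (libstdc++ has shipped it since GCC 5), so with older compilers you have to do the conversion manually with the following code, where Str and WStr are assumed to be typedefs for std::string and std::wstring: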

// Decodes the UTF-8 bytes in `src` into wide characters, replacing the
// previous contents of `dest`.
//
// Malformed input never aborts the conversion; each problem produces one
// U+FFFD REPLACEMENT CHARACTER in `dest`:
//   - a continuation byte (0x80..0xBF) with no sequence in progress,
//   - a sequence cut short by another lead/ASCII byte or by the end of input,
//   - a byte that can never start a sequence (0xF8..0xFF),
//   - a completed sequence that is overlong, encodes a surrogate
//     (U+D800..U+DFFF) or exceeds U+10FFFF.
//
// When wchar_t is narrower than 32 bits, code points above U+FFFF are stored
// as a UTF-16 surrogate pair instead of being truncated.
void utf8toWStr(WStr& dest, const Str& src){
 // Spelled numerically so the value does not depend on the source encoding.
 const wchar_t err = static_cast<wchar_t>(0xFFFD);
 dest.clear();
 unsigned long cp = 0;     // code point being assembled (wide enough for 21 bits)
 unsigned long minCp = 0;  // smallest code point allowed for the current sequence length
 int pending = 0;          // continuation bytes still expected
 for (size_t i = 0; i < src.size(); i++){
  const unsigned char c = static_cast<unsigned char>(src[i]);
  if (c >= 0x80 && c <= 0xbf){//continuation byte
   if (pending == 0){
    dest.push_back(err);
    continue;
   }
   cp = (cp << 6) | (c & 0x3f);
   if (--pending != 0)
    continue;
   // Sequence complete: reject overlong forms, surrogates and out-of-range values.
   const bool valid = cp >= minCp && cp <= 0x10ffff && !(cp >= 0xd800 && cp <= 0xdfff);
   if (!valid)
    dest.push_back(err);
   else if (cp > 0xffff && sizeof(wchar_t) < 4){
    // 16-bit wchar_t cannot hold the value; emit a UTF-16 surrogate pair.
    const unsigned long v = cp - 0x10000;
    dest.push_back(static_cast<wchar_t>(0xd800 + (v >> 10)));
    dest.push_back(static_cast<wchar_t>(0xdc00 + (v & 0x3ff)));
   }
   else
    dest.push_back(static_cast<wchar_t>(cp));
   continue;
  }
  // Any non-continuation byte starts a new unit, so a sequence that is
  // still waiting for bytes was truncated.
  if (pending){
   dest.push_back(err);
   pending = 0;
  }
  if (c <= 0x7f)//single-byte (ASCII)
   dest.push_back(static_cast<wchar_t>(c));
  else if (c <= 0xdf){//2byte sequence start
   pending = 1;
   cp = c & 0x1f;
   minCp = 0x80;
  }
  else if (c <= 0xef){//3byte sequence start
   pending = 2;
   cp = c & 0x0f;
   minCp = 0x800;
  }
  else if (c <= 0xf7){//4byte sequence start
   pending = 3;
   cp = c & 0x07;
   minCp = 0x10000;
  }
  else//0xf8..0xff never appear in UTF-8
   dest.push_back(err);
 }
 if (pending)//input ended in the middle of a sequence
  dest.push_back(err);
}

// Encodes the wide characters in `src` as UTF-8, replacing the previous
// contents of `dest`.
//
// A high surrogate (U+D800..U+DBFF) immediately followed by a low surrogate
// (U+DC00..U+DFFF) is combined into one code point and written as a 4-byte
// sequence. Values that cannot be represented in UTF-8 (an unpaired
// surrogate, anything above U+10FFFF, or a negative wchar_t) are written as
// a single '?'.
void wstrToUtf8(Str& dest, const WStr& src){
 dest.clear();
 const size_t count = src.size();
 for (size_t i = 0; i < count; i++){
  // Use an unsigned value: wchar_t may be signed, and a negative element
  // must not pass the "<= 0x7f" test and be emitted as a raw byte.
  unsigned long cp = static_cast<unsigned long>(src[i]);
  if (cp >= 0xd800 && cp <= 0xdbff && i + 1 < count){
   const unsigned long low = static_cast<unsigned long>(src[i + 1]);
   if (low >= 0xdc00 && low <= 0xdfff){
    cp = 0x10000 + ((cp - 0xd800) << 10) + (low - 0xdc00);
    i++;//the low surrogate has been consumed
   }
  }
  if (cp <= 0x7f)
   dest.push_back(static_cast<char>(cp));
  else if (cp <= 0x7ff){
   dest.push_back(static_cast<char>(0xc0 | ((cp >> 6) & 0x1f)));
   dest.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
  }
  else if (cp >= 0xd800 && cp <= 0xdfff)//unpaired surrogate: not encodable
   dest.push_back('?');
  else if (cp <= 0xffff){
   dest.push_back(static_cast<char>(0xe0 | ((cp >> 12) & 0x0f)));
   dest.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3f)));
   dest.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
  }
  else if (cp <= 0x10ffff){
   dest.push_back(static_cast<char>(0xf0 | ((cp >> 18) & 0x07)));
   dest.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3f)));
   dest.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3f)));
   dest.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
  }
  else
   dest.push_back('?');
 }
}

// Convenience overload: converts `str` with the two-argument wstrToUtf8()
// and hands the resulting Str back by value.
Str wstrToUtf8(const WStr& str){
 Str encoded;
 wstrToUtf8(encoded, str);
 return encoded;
}

// Convenience overload: converts `str` with the two-argument utf8toWStr()
// and hands the resulting WStr back by value.
WStr utf8toWStr(const Str& str){
 WStr decoded;
 utf8toWStr(decoded, str);
 return decoded;
}

// Writes the wide string `s` to the narrow stream `f`: the string is first
// converted with wstrToUtf8() and the resulting Str is inserted into `f`.
// Returns `f` so insertions can be chained.
std::ostream& operator<<(std::ostream& f, const WStr& s){
 Str utf8;
 wstrToUtf8(utf8, s);
 return f << utf8;
}

// Extracts one Str from the narrow stream `f` using the stream's own
// operator>>, then converts it with utf8toWStr() into `s`.
// The conversion runs whether or not the extraction succeeded.
// Returns `f` so extractions can be chained and tested.
std::istream& operator>>(std::istream& f, WStr& s){
 Str token;
 f >> token;
 utf8toWStr(s, token);
 return f;
}

Wang Xuancong's Home Page

Tuesday, 8 October 2013

C++ STL how to print unicode string and characters

No comments:

Post a Comment