Tuesday, 8 October 2013

C++ STL how to print unicode string and characters

1. essential headers

#include <iostream>
#include <fstream>
#include <algorithm>
#include <sstream>
#include <clocale>
#include <cwchar>   // for mbstate_t
#include <cstddef>  // for std::size_t

2.
In order to use wcout to print a wchar_t[] or a wstring, you must first call
 setlocale( LC_ALL, "en_US.UTF-8" );
Once wide output has been written through wcout, stdout is wide-oriented, so you should no longer write to cout in the same program.

3.
in order to use wifstream to read, you must first run imbue() :
 wifstream fin("utf8.txt");
 fin.imbue(std::locale("en_US.UTF-8"));

sample code that works correctly:
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <iostream>
#include <fstream>
#include <sstream>
#include <locale>
#include <cwchar>   // for mbstate_t
#include <cstddef>  // for std::size_t

// Makes every name from namespace std usable without the std:: prefix for the
// rest of this translation unit.
using namespace std;
// Helper macros built on `typeof`, which is a GNU compiler extension and not
// standard C++.
//   def(a,b): declares `a` with the type of `b` and initialises it from `b`.
//   dec(a,b): declares `a` with the type of `b`, without an initialiser.
// Neither macro is used by the sample program below.
#define def(a,b) typeof(b) a=b
#define dec(a,b) typeof(b) a

/**
 * Demo program: prints two wide strings to wcout, then reads
 * whitespace-separated words from the UTF-8 encoded file "a.txt" and prints
 * the size, the length and the individual characters of each word.
 *
 * Returns 0 on success and 1 when the "en_US.UTF-8" locale cannot be
 * selected or "a.txt" cannot be opened.
 */
int main(int argc, char *argv[]){
    (void)argc;
    (void)argv;

    // Select the C locale before anything is written to wcout, and check the
    // result: setlocale returns a null pointer when the request cannot be
    // honoured (for example because the locale is not installed).
    if (setlocale(LC_ALL, "en_US.UTF-8") == NULL) {
        fprintf(stderr, "locale en_US.UTF-8 is not available\n");
        return 1;
    }

    // NOTE: the std::locale constructor throws std::runtime_error when the
    // named locale is unknown to the C++ runtime.
    const std::locale utf8_locale("en_US.UTF-8");
    const wchar_t ss[] = L"你是谁我?asd asdf d";
    const std::wstring ws(L"机器,1234电脑");

    // Imbue before the first read so the file's bytes are decoded as UTF-8.
    std::wifstream fin("a.txt");
    fin.imbue(utf8_locale);

    std::wcout << ws << std::endl;
    std::wcout << ss << std::endl;

    // Report a missing input file instead of silently printing nothing.
    if (!fin) {
        std::wcerr << L"cannot open a.txt" << std::endl;
        return 1;
    }

    std::wstring s;
    while (fin >> s) {
        std::wcout << L"size=" << s.size() << std::endl;
        std::wcout << L"length=" << s.length() << std::endl;
        // std::size_t index: avoids the signed/unsigned comparison with length().
        for (std::size_t x = 0; x < s.length(); x++)
            std::wcout << s[x] << L" ";
        std::wcout << std::endl;
    }
    return 0;
}

a.txt contains 1 line:
我是 谁?a sd f

output:
机器,1234电脑
你是谁我?asd asdf d
size=2
length=2
我 是
size=3
length=3
谁 ? a
size=2
length=2
s d
size=1
length=1
f

Take note that your console must support utf8 and console locale must be set to en_US.UTF-8


To convert between UTF-8 encoded string and wstring, use the following:

#include <codecvt>
#include <locale>   // for std::wstring_convert
#include <string>

// Decodes the UTF-8 bytes in `str` and returns them as a wide string.
// The work is delegated to std::wstring_convert::from_bytes.
std::wstring utf8_string_to_wstring (const std::string& str)
{
    typedef std::codecvt_utf8<wchar_t> Utf8Facet;
    std::wstring_convert<Utf8Facet> converter;
    std::wstring wide = converter.from_bytes(str);
    return wide;
}

// Encodes the wide string `str` as UTF-8 and returns the resulting bytes.
// The work is delegated to std::wstring_convert::to_bytes.
std::string utf8_wstring_to_string (const std::wstring& str)
{
    typedef std::codecvt_utf8<wchar_t> Utf8Facet;
    std::wstring_convert<Utf8Facet> converter;
    std::string bytes = converter.to_bytes(str);
    return bytes;
}
 
 
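However, <codecvt> was not yet implemented by gcc/g++ when this was written (libstdc++ added it in GCC 5), so with older compilers you can only do the conversion manually, as below. Str and WStr are assumed here to be typedefs for std::string and std::wstring: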

/**
 * Decodes the UTF-8 bytes in `src` into the wide string `dest`.
 *
 * Any previous content of `dest` is discarded. Every malformed element of the
 * input produces one U+FFFD REPLACEMENT CHARACTER in the output:
 *   - a continuation byte (0x80..0xbf) with no sequence in progress,
 *   - a sequence cut short by an ASCII byte, by another lead byte, by an
 *     invalid byte or by the end of the input,
 *   - a byte in the range 0xf8..0xff,
 *   - a complete sequence that is an overlong encoding, encodes a surrogate
 *     (U+D800..U+DFFF) or encodes a value above U+10FFFF.
 *
 * Where wchar_t is 16 bits wide, code points above U+FFFF are stored as a
 * UTF-16 surrogate pair; otherwise each code point is one element of `dest`.
 */
void utf8toWStr(WStr& dest, const Str& src){
 dest.clear();
 const wchar_t err = L'\xfffd'; // U+FFFD, spelled numerically so it does not depend on the source file's encoding
 char32_t cp = 0;     // code point being assembled (wide enough even where wchar_t is 16 bits)
 char32_t minCp = 0;  // smallest code point that legitimately needs the current sequence length
 int pending = 0;     // continuation bytes still expected
 for (size_t i = 0; i < src.size(); i++){
  const unsigned char c = static_cast<unsigned char>(src[i]);
  if (c >= 0x80 && c <= 0xbf){//second/third/etc byte
   if (!pending){
    dest.push_back(err); // continuation byte without a lead byte
    continue;
   }
   cp = (cp << 6) | (c & 0x3f);
   if (--pending)
    continue; // sequence not finished yet
   if (cp < minCp || cp > 0x10ffff || (cp >= 0xd800 && cp <= 0xdfff))
    dest.push_back(err); // overlong, out of range, or an encoded surrogate
   else if (sizeof(wchar_t) == 2 && cp > 0xffff){
    // Does not fit into one 16-bit unit: emit a UTF-16 surrogate pair.
    const char32_t v = cp - 0x10000;
    dest.push_back(static_cast<wchar_t>(0xd800 + (v >> 10)));
    dest.push_back(static_cast<wchar_t>(0xdc00 + (v & 0x3ff)));
   }
   else
    dest.push_back(static_cast<wchar_t>(cp));
   continue;
  }
  // Every other byte begins something new, so a sequence that is still
  // waiting for continuation bytes has been cut short.
  if (pending){
   dest.push_back(err);
   pending = 0;
  }
  if (c <= 0x7f)//single byte (ASCII)
   dest.push_back(static_cast<wchar_t>(c));
  else if (c <= 0xdf){//2byte sequence start
   pending = 1;
   cp = c & 0x1f;
   minCp = 0x80;
  }
  else if (c <= 0xef){//3byte sequence start
   pending = 2;
   cp = c & 0x0f;
   minCp = 0x800;
  }
  else if (c <= 0xf7){//4byte sequence start
   pending = 3;
   cp = c & 0x07;
   minCp = 0x10000;
  }
  else
   dest.push_back(err); // 0xf8..0xff never occur in UTF-8
 }
 if (pending)
  dest.push_back(err); // input ended in the middle of a sequence
}

/**
 * Encodes the wide string `src` as UTF-8 into `dest`.
 *
 * Any previous content of `dest` is discarded. Each element of `src` is
 * treated as a Unicode code point, except that a UTF-16 surrogate pair (a
 * high surrogate immediately followed by a low surrogate) is combined into
 * the single code point it represents. Values that have no UTF-8 encoding --
 * unpaired surrogates and anything outside 0..U+10FFFF -- are written as '?'.
 */
void wstrToUtf8(Str& dest, const WStr& src){
 dest.clear();
 for (size_t i = 0; i < src.size(); i++){
  // Work on an unsigned value: where wchar_t is a signed type, a negative
  // element would otherwise pass the "<= 0x7f" test and be emitted as a raw byte.
  char32_t cp = static_cast<char32_t>(src[i]);
  if (cp >= 0xd800 && cp <= 0xdbff && i + 1 < src.size()){
   const char32_t low = static_cast<char32_t>(src[i + 1]);
   if (low >= 0xdc00 && low <= 0xdfff){
    cp = 0x10000 + ((cp - 0xd800) << 10) + (low - 0xdc00);
    i++; // the low surrogate has been consumed as well
   }
  }
  if (cp <= 0x7f)
   dest.push_back(static_cast<char>(cp));
  else if (cp <= 0x7ff){
   dest.push_back(static_cast<char>(0xc0 | ((cp >> 6) & 0x1f)));
   dest.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
  }
  else if (cp >= 0xd800 && cp <= 0xdfff)
   dest.push_back('?'); // unpaired surrogate: not encodable in UTF-8
  else if (cp <= 0xffff){
   dest.push_back(static_cast<char>(0xe0 | ((cp >> 12) & 0x0f)));
   dest.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3f)));
   dest.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
  }
  else if (cp <= 0x10ffff){
   dest.push_back(static_cast<char>(0xf0 | ((cp >> 18) & 0x07)));
   dest.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3f)));
   dest.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3f)));
   dest.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
  }
  else
   dest.push_back('?'); // beyond U+10FFFF (includes negative wchar_t values)
 }
}

// Convenience overload: encodes `str` via the two-argument wstrToUtf8 and
// returns the encoded string by value.
Str wstrToUtf8(const WStr& str){
 Str utf8;
 wstrToUtf8(utf8, str);
 return utf8;
}

// Convenience overload: decodes `str` via the two-argument utf8toWStr and
// returns the decoded wide string by value.
WStr utf8toWStr(const Str& str){
 WStr wide;
 utf8toWStr(wide, str);
 return wide;
}

std::ostream& operator<<(std::ostream& f, const WStr& s){
 Str s1;
 wstrToUtf8(s1, s);
 f << s1;
 return f;
}

std::istream& operator>>(std::istream& f, WStr& s){
 Str s1;
 f >> s1;
 utf8toWStr(s, s1);
 return f;
}
 
 

No comments:

Post a Comment