Tuesday, 8 October 2013

C++ STL how to print unicode string and characters

1. essential headers

#include <iostream>
#include <fstream>
#include <algorithm>
#include <sstream>
#include <clocale>
#include <cwchar>   // for mbstate_t
#include <cstddef>  // for std::size_t

2.
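To use wcout to print a wchar_t[] or a wstring, you must first call
 setlocale( LC_ALL, "en_US.UTF-8" );
Once you have written to wcout, do not mix in cout: the first wide output makes stdout wide-oriented, and narrow output on the same stream no longer works reliably.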

3.
To read UTF-8 text through a wifstream, you must first imbue() it with a UTF-8 locale:
 wifstream fin("utf8.txt");
 fin.imbue(std::locale("en_US.UTF-8"));

sample code that works correctly:
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <iostream>
#include <fstream>
#include <sstream>
#include <locale>
#include <cwchar>   // for mbstate_t
#include <cstddef>  // for std::size_t

using namespace std;
#define def(a,b) typeof(b) a=b
#define dec(a,b) typeof(b) a

// Demonstrates wide-character console output and reading a UTF-8 text file
// word by word through a locale-imbued wifstream.
//
// Prints two wide strings, then, for every whitespace-separated word in
// "a.txt", prints its size, its length and its characters separated by
// spaces.
//
// Returns 0 on success, or 1 if the C locale "en_US.UTF-8" cannot be
// selected (without it wcout cannot emit the non-ASCII characters).
int main(int argc, char *argv[]){
    (void)argc;   // command-line arguments are not used
    (void)argv;

    // Note: std::locale throws std::runtime_error if the named locale is
    // not installed on the system.
    std::locale utf8_locale("en_US.UTF-8");
    wchar_t ss[] = L"你是谁我?asd asdf d";
    std::wstring ws(L"机器,1234电脑");

    std::wifstream fin("a.txt");
    fin.imbue(utf8_locale);   // decode the file's bytes as UTF-8
    std::wstring s;

    // wcout converts through the C locale, so it must be UTF-8 before the
    // first wide write. Report failure on stderr; stdout is left untouched.
    if (setlocale( LC_ALL, "en_US.UTF-8" ) == NULL){
        std::cerr << "setlocale(LC_ALL, \"en_US.UTF-8\") failed" << std::endl;
        return 1;
    }

    std::wcout << ws << std::endl;
    std::wcout << ss << std::endl;
    while(fin>>s){
        std::wcout << L"size=" << s.size() << std::endl;
        std::wcout << L"length=" << s.length() << std::endl;
        // size_t index: length() is unsigned, so an int index would mix
        // signed and unsigned in the comparison.
        for(std::size_t x=0; x<s.length(); x++)
            std::wcout << s[x] << L" ";
        std::wcout << std::endl;
    }
    return  0;
}

a.txt contains 1 line:
我是 谁?a sd f

output:
机器,1234电脑
你是谁我?asd asdf d
size=2
length=2
我 是
size=3
length=3
谁 ? a
size=2
length=2
s d
size=1
length=1
f

Take note that your console must support utf8 and console locale must be set to en_US.UTF-8


To convert between UTF-8 encoded string and wstring, use the following:

#include <codecvt>
#include <locale>       // std::wstring_convert
#include <string>
#include <type_traits>  // std::conditional

// Convert a UTF-8 encoded std::string to a std::wstring.
//
// The conversion facet is picked from the width of wchar_t: where wchar_t
// is 16 bits (e.g. Windows) the result is UTF-16, so code points above
// U+FFFF become surrogate pairs; elsewhere every wchar_t holds one whole
// code point. Plain codecvt_utf8<wchar_t> would only give UCS-2 on 16-bit
// platforms and could not represent those code points at all.
//
// Throws std::range_error if `str` is not valid UTF-8.
std::wstring utf8_string_to_wstring (const std::string& str)
{
    typedef std::conditional<sizeof(wchar_t) == 2,
                             std::codecvt_utf8_utf16<wchar_t>,
                             std::codecvt_utf8<wchar_t> >::type utf8_facet;
    std::wstring_convert<utf8_facet> myconv;
    return myconv.from_bytes(str);
}

// Convert a std::wstring to a UTF-8 encoded std::string.
//
// The conversion facet is picked from the width of wchar_t: where wchar_t
// is 16 bits (e.g. Windows) the input is read as UTF-16, so surrogate pairs
// are combined into a single 4-byte UTF-8 sequence; elsewhere every wchar_t
// is taken as one whole code point.
//
// Throws std::range_error if `str` contains a value that cannot be encoded.
std::string utf8_wstring_to_string (const std::wstring& str)
{
    typedef std::conditional<sizeof(wchar_t) == 2,
                             std::codecvt_utf8_utf16<wchar_t>,
                             std::codecvt_utf8<wchar_t> >::type utf8_facet;
    std::wstring_convert<utf8_facet> myconv;
    return myconv.to_bytes(str);
}
 
 
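However, at the time of writing (2013) <codecvt> was not yet implemented by gcc/g++ (it was added in GCC 5), so on older compilers you have to do the conversion manually with the code below. Here Str and WStr are assumed to be typedefs for std::string and std::wstring: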

// Decode the UTF-8 byte string `src` into `dest`, one wchar_t per code point.
//
// `dest` is cleared first. Every ill-formed piece of input is replaced by a
// single U+FFFD REPLACEMENT CHARACTER instead of aborting the conversion:
//   - a continuation byte with no sequence in progress,
//   - a sequence cut short by another lead byte, an ASCII byte, an invalid
//     byte, or the end of the input,
//   - a byte that can never start a sequence (0xF8..0xFF),
//   - an overlong encoding, a surrogate (U+D800..U+DFFF), or a value above
//     U+10FFFF,
//   - a code point that does not fit into wchar_t on this platform.
void utf8toWStr(WStr& dest, const Str& src){
 dest.clear();
 // Written as an escape so the value does not depend on the encoding of
 // this source file.
 const wchar_t err = L'\xFFFD';
 unsigned long cp = 0;    // code point being assembled
 unsigned long minCp = 0; // smallest code point that needs this many bytes
 int bytes = 0;           // continuation bytes still expected
 for (size_t i = 0; i < src.size(); i++){
  unsigned char c = (unsigned char)src[i];
  if (c >= 0x80 && c <= 0xbf){//continuation byte
   if (!bytes){
    dest.push_back(err); // nothing to continue
    continue;
   }
   cp = (cp << 6)|(c & 0x3f);
   bytes--;
   if (bytes == 0){
    bool bad = cp < minCp                      // overlong encoding
     || cp > 0x10ffff                          // beyond Unicode
     || (cp >= 0xd800 && cp <= 0xdfff)         // surrogate
     || (unsigned long)(wchar_t)cp != cp;      // does not fit in wchar_t
    dest.push_back(bad ? err : (wchar_t)cp);
   }
   continue;
  }
  // Any other byte ends a sequence that is still waiting for continuation
  // bytes; report the truncated sequence before handling this byte.
  if (bytes){
   dest.push_back(err);
   bytes = 0;
  }
  if (c <= 0x7f){//single byte (ASCII)
   dest.push_back((wchar_t)c);
  }
  else if (c <= 0xdf){//2byte sequence start
   bytes = 1;
   cp = c & 0x1f;
   minCp = 0x80;
  }
  else if (c <= 0xef){//3byte sequence start
   bytes = 2;
   cp = c & 0x0f;
   minCp = 0x800;
  }
  else if (c <= 0xf7){//4byte sequence start
   bytes = 3;
   cp = c & 0x07;
   minCp = 0x10000;
  }
  else{//0xf8..0xff never appear in UTF-8
   dest.push_back(err);
  }
 }
 if (bytes) // input ended in the middle of a sequence
  dest.push_back(err);
}

// Encode the wide string `src` as UTF-8 into `dest` (cleared first).
//
// Each element is treated as a Unicode code point. A high surrogate that is
// immediately followed by a low surrogate (UTF-16, as found where wchar_t is
// 16 bits) is combined into the single code point the pair stands for.
// Anything that is not a Unicode scalar value -- a lone surrogate, a value
// above U+10FFFF, or a negative wchar_t -- is written as '?'.
void wstrToUtf8(Str& dest, const WStr& src){
 dest.clear();
 for (size_t i = 0; i < src.size(); i++){
  // Go through unsigned long so that a negative wchar_t (the type is signed
  // on some platforms) becomes a huge value and lands in the '?' branch
  // instead of being emitted as a raw byte.
  unsigned long cp = (unsigned long)src[i];

  // Combine a UTF-16 surrogate pair into one code point.
  if (cp >= 0xd800 && cp <= 0xdbff && i + 1 < src.size()){
   unsigned long lo = (unsigned long)src[i + 1];
   if (lo >= 0xdc00 && lo <= 0xdfff){
    cp = 0x10000 + ((cp - 0xd800) << 10) + (lo - 0xdc00);
    i++; // the low surrogate has been consumed
   }
  }

  if (cp >= 0xd800 && cp <= 0xdfff) // lone surrogate: not encodable
   dest.push_back('?');
  else if (cp <= 0x7f)
   dest.push_back((char)cp);
  else if (cp <= 0x7ff){
   dest.push_back((char)(0xc0 | (cp >> 6)));
   dest.push_back((char)(0x80 | (cp & 0x3f)));
  }
  else if (cp <= 0xffff){
   dest.push_back((char)(0xe0 | (cp >> 12)));
   dest.push_back((char)(0x80 | ((cp >> 6) & 0x3f)));
   dest.push_back((char)(0x80 | (cp & 0x3f)));
  }
  else if (cp <= 0x10ffff){
   dest.push_back((char)(0xf0 | (cp >> 18)));
   dest.push_back((char)(0x80 | ((cp >> 12) & 0x3f)));
   dest.push_back((char)(0x80 | ((cp >> 6) & 0x3f)));
   dest.push_back((char)(0x80 | (cp & 0x3f)));
  }
  else
   dest.push_back('?');
 }
}

// Convenience overload: returns the UTF-8 encoding of `str` by value
// instead of writing it into an output parameter.
Str wstrToUtf8(const WStr& str){
 Str utf8;
 wstrToUtf8(utf8, str);
 return utf8;
}

// Convenience overload: returns the decoded wide string for the UTF-8
// input `str` by value instead of writing it into an output parameter.
WStr utf8toWStr(const Str& str){
 WStr wide;
 utf8toWStr(wide, str);
 return wide;
}

// Stream insertion for wide strings on a narrow stream: `s` is encoded as
// UTF-8 and the resulting bytes are written to `f`.
std::ostream& operator<<(std::ostream& f, const WStr& s){
 return f << wstrToUtf8(s);
}

std::istream& operator>>(std::istream& f, WStr& s){
 Str s1;
 f >> s1;
 utf8toWStr(s, s1);
 return f;
}
 
 

Monday, 7 October 2013

The world's best vimrc

" my latest version of .vimrc
" F2: toggle paste mode (turns auto-indent off while pasting)
" F3/F4: show/hide line numbers
" F5/F6: set/unset scrollbind
" F7/F8: switch two file windows between horizontal and vertical layout
" F9/F10: set/unset wrap
" F12: make vimdiff ignore whitespace/tab differences
" insert mode: the mouse can scroll the screen and set the cursor position
" non-insert mode: the mouse can scroll the cursor; text highlighted with the terminal mouse is automatically saved into the copy/paste buffer
" d#d : delete # lines, contents no longer go into the copy/paste buffer
" c#c : cut # lines, contents go into the copy/paste buffer
" the mouse works past the 220th column
" vimdiff : all text visible (evening colorscheme)

" ---- general editing ----
filetype plugin indent on
set mouse=a
set tabstop=4
set shiftwidth=4
set autoindent
set smartindent
set nowrap
set backspace=2
syntax on

" Python buffers: 4-column hard tabs. setlocal keeps the settings confined to
" the python buffer, and the augroup with autocmd! stops the autocommand from
" being registered again each time this file is re-sourced.
augroup PythonTabs
  autocmd!
  autocmd FileType python setlocal tabstop=4 shiftwidth=4 noexpandtab
augroup END

set noexpandtab
" <F2> toggles 'paste' so pasted text is not re-indented.
set pastetoggle=<F2>
" <F8> was also bound to :TlistToggle here, but the `map <F8>` further down
" replaced that binding in normal mode, so it never took effect. Kept
" disabled so <F8> has exactly one meaning.
"nnoremap <silent> <F8> :TlistToggle<CR>
" Insert mode: Ctrl-J / Ctrl-K move one word back / forward.
inoremap <C-J> <C-\><C-O>b
inoremap <C-K> <C-\><C-O>w
set hlsearch
hi comment ctermfg=green
set fileencodings=utf8,gb2312

" ---- function keys ----
" F3/F4: show/hide line numbers
map <F3> <Esc>:set number<CR>
map <F4> <Esc>:set nonumber<CR>
" F5/F6: set/unset scrollbind
map <F5> <Esc>:set scrollbind<CR>
map <F6> <Esc>:set noscrollbind<CR>
" F7/F8: stack the two windows horizontally / place them side by side
map <F7> <C-W>t<C-W>K
map <F8> <C-W>t<C-W>H
" F9/F10: set/unset wrap
map <F9> <Esc>:set wrap<CR>
map <F10> <Esc>:set nowrap<CR>
" F12: ignore whitespace differences in diff mode
map <F12> <Esc>:set diffopt+=iwhite<CR>
"set viminfo='10,\"100,:20,%,n~/.viminfo
"set indentexpr=''
"set noautoindent
"autocmd BufRead,BufNewFile *.cu set noic cin autoindent

" Diff mode gets the evening colorscheme.
if &diff
    colorscheme evening
endif

" Use SGR mouse reporting when this Vim has the mouse_sgr feature, otherwise
" fall back to xterm2. This is the only place 'ttymouse' is set; the earlier
" unconditional `set ttymouse=xterm2` was always overridden here.
if has("mouse_sgr")
    set ttymouse=sgr
else
    set ttymouse=xterm2
endif

" Jump to the position stored in the '" mark (where the cursor was when the
" buffer was last exited). Returns 1 after jumping, 0 when the mark lies
" beyond the last line of the buffer and nothing is done.
function! ResCur()
  if line("'\"") > line("$")
    return 0
  endif
  normal! g`"
  return 1
endfunction

" Call ResCur() every time a buffer is displayed in a window.
augroup resCur
  autocmd!
  autocmd BufWinEnter * call ResCur()
augroup END

" Deleting with d sends the text to the black-hole register, so it does not
" overwrite what was last yanked.
nnoremap d "_d
vnoremap d "_d
" Pasting over a selection: delete the selection into the black-hole register,
" then put the unnamed register in its place. The previous mapping, "_p,
" named the (always empty) black-hole register as the paste SOURCE, so the
" selection was removed and nothing was inserted.
vnoremap p "_dP


" Size threshold in bytes (10 MB). Files larger than this are opened with the
" lightweight settings applied by LargeFile().
let g:LargeFile = 1024 * 1024 * 10
augroup LargeFile
 " Clear the group first so re-sourcing this file does not register the
 " autocommand a second time.
 autocmd!
 " getfsize() returns -2 when the size is too big to fit in a Number; treat
 " that as a large file too.
 autocmd BufReadPre * let f=getfsize(expand("<afile>")) | if f > g:LargeFile || f == -2 | call LargeFile() | endif
augroup END

" Apply lightweight settings for a file that exceeds g:LargeFile.
" Defined with `function!` so that re-sourcing this file replaces the
" function instead of failing with E122 (function already exists).
function! LargeFile()
 " no syntax highlighting etc
 " NOTE(review): 'eventignore' is a global option and is not restored here,
 " so FileType events stay ignored for buffers opened afterwards too.
 set eventignore+=FileType
 " save memory when other file is viewed
 setlocal bufhidden=unload
 " is read-only (write with :w new_filename)
 " setlocal buftype=nowrite
 " no undo possible
 setlocal noswapfile
 " display message
 " Deferred to VimEnter, so it is shown only for files opened while Vim is
 " starting up.
 autocmd VimEnter *  echo "The file is larger than " . (g:LargeFile / 1024 / 1024) . " MB, so some options are changed (see .vimrc for details)."
endfunction