Character Encoding - ASCII, Unicode & UTF-8

2 minutes read

| Bytes | Unicode code point range (hex) | UTF-8 encoding (binary) |
| --- | --- | --- |
| 1 | 0000 0000 to 0000 007F | 0xxxxxxx |
| 2 | 0000 0080 to 0000 07FF | 110xxxxx 10xxxxxx |
| 3 | 0000 0800 to 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx |
| 4 | 0001 0000 to 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |

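For example, the Unicode code point of "严" is 4E25 (binary 100111000100101). According to the table above, this code point falls in the third row (0000 0800 to 0000 FFFF), so it is encoded in three bytes: the bits are padded to 16 (0100 111000 100101) and filled into the x positions of 1110xxxx 10xxxxxx 10xxxxxx, giving the UTF-8 bytes E4 B8 A5 (11100100 10111000 10100101). As this shows, a character's Unicode code point is not the same as its UTF-8 byte sequence, but standard libraries can convert between the two.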

Preprocessing

Read UTF-8 file:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
 
/**
 * Reads the file {@code test.txt} (relative to the working directory) as
 * UTF-8 text and echoes every line to standard output.
 */
public class readUTF8 {
    /**
     * Decodes {@code test.txt} as UTF-8 and prints it line by line.
     *
     * <p>I/O failures (including a missing file) are reported on standard
     * error; they are not rethrown.
     *
     * @param args ignored
     */
    public static void main(String[] args){
        // Note: a literal such as "\test.txt" contains the escape \t (TAB), so it
        // names TAB + "est.txt". A plain relative path avoids the escape entirely.
        File fileDir = new File("test.txt");

        // try-with-resources closes all three layers even when readLine() throws.
        // The Charset constant cannot fail the way a charset *name* lookup can.
        try (FileInputStream fis = new FileInputStream(fileDir);
             InputStreamReader isr =
                     new InputStreamReader(fis, java.nio.charset.StandardCharsets.UTF_8);
             BufferedReader in = new BufferedReader(isr)) {

            String str;
            while ((str = in.readLine()) != null) {
                System.out.println(str);
            }
        } catch (IOException e) {
            // Keep the exception type in the report; getMessage() alone can be null
            // or ambiguous.
            System.err.println("Failed to read " + fileDir + ": " + e);
        }
    }
}

Write UTF-8 file out:

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
 
/**
 * Writes three sample lines to {@code test.txt} (relative to the working
 * directory), encoding the text as UTF-8.
 */
public class test {
    /**
     * Creates or overwrites {@code test.txt} with the sample lines, each
     * terminated by CRLF.
     *
     * <p>I/O failures are reported on standard error; they are not rethrown.
     *
     * @param args ignored
     */
    public static void main(String[] args){
        // Note: a literal such as "\test.txt" contains the escape \t (TAB), so it
        // names TAB + "est.txt". A plain relative path avoids the escape entirely.
        File fileDir = new File("test.txt");

        // try-with-resources flushes and closes the writer chain even when an
        // append fails, so no explicit flush()/close() is needed.
        try (FileOutputStream fos = new FileOutputStream(fileDir);
             OutputStreamWriter osw =
                     new OutputStreamWriter(fos, java.nio.charset.StandardCharsets.UTF_8);
             Writer out = new BufferedWriter(osw)) {

            out.append("Website UTF-8").append("\r\n");
            // NOTE(review): the '?' runs below look like non-ASCII sample text that was
            // lost in a transcoding step; they are kept as-is because the original
            // characters cannot be recovered from this file - confirm with the author.
            out.append("?? UTF-8").append("\r\n");
            out.append("??????? UTF-8").append("\r\n");
        } catch (IOException e) {
            // Keep the exception type in the report; getMessage() alone can be null
            // or ambiguous.
            System.err.println("Failed to write " + fileDir + ": " + e);
        }
    }
}

Convert from String to byte[]:

// Encode the string's characters into a byte array using the UTF-8 charset.
// String.getBytes(String charsetName) declares the checked
// UnsupportedEncodingException, so enclosing code must catch or declare it.
String s = "some text here";
byte[] b = s.getBytes("UTF-8");

Convert from byte[] to String:

// 99, 97 and 116 are the ASCII codes of 'c', 'a' and 't'.
byte[] b = {(byte) 99, (byte)97, (byte)116};
// Decode the bytes as US-ASCII, producing "cat". String(byte[], String charsetName)
// declares the checked UnsupportedEncodingException.
String s = new String(b, "US-ASCII");

Comments