Character Encoding - ASCII, Unicode & UTF-8
2 minutes read
| Bytes | Unicode code point range (hex) | UTF-8 byte sequence (binary) |
| --- | --- | --- |
| 1 | 0000 0000 to 0000 007F | 0xxxxxxx |
| 2 | 0000 0080 to 0000 07FF | 110xxxxx 10xxxxxx |
| 3 | 0000 0800 to 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx |
| 4 | 0001 0000 to 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
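For example, the Unicode code point of "严" is 4E25 (binary 100111000100101). It falls in the range 0000 0800 to 0000 FFFF, so according to the table above it belongs to the third row and is encoded in three bytes. Filling the 16 bits 0100 111000 100101 into the x positions of 1110xxxx 10xxxxxx 10xxxxxx gives the UTF-8 encoding E4B8A5 (11100100 10111000 10100101). As this shows, a character's Unicode code point is not the same as its UTF-8 byte sequence, but standard libraries can convert between the two.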
Preprocessing
Read UTF-8 file:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
/**
 * Prints the contents of a UTF-8 encoded text file to standard output, line by line.
 *
 * <p>Usage: {@code java readUTF8 [path]}. When no path is given, {@code test.txt} in the
 * current working directory is read.
 */
public class readUTF8 {

    /** File read when no path is supplied on the command line. */
    private static final String DEFAULT_PATH = "test.txt";

    /**
     * Decodes the file as UTF-8 and echoes every line to {@code System.out}.
     * I/O failures (for example a missing file) are reported on {@code System.err}.
     *
     * @param args optional; {@code args[0]} is the path of the file to read
     */
    public static void main(String[] args) {
        // Do not write the path as "\test.txt": inside a Java string literal "\t" is the
        // TAB escape, so that literal names a file called <TAB>est.txt.
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PATH;
        File fileDir = new File(path);

        // try-with-resources closes the reader (and the underlying stream) even when
        // reading or printing fails part-way through.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(
                        new FileInputStream(fileDir),
                        java.nio.charset.StandardCharsets.UTF_8))) {
            String str;
            while ((str = in.readLine()) != null) {
                System.out.println(str);
            }
        } catch (IOException e) {
            System.err.println("Could not read " + fileDir + ": " + e.getMessage());
        }
    }
}
Write UTF-8 file out:
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
/**
 * Writes a few sample lines, including non-ASCII text, to a file encoded as UTF-8.
 *
 * <p>Usage: {@code java test [path]}. When no path is given, {@code test.txt} in the
 * current working directory is created or overwritten.
 */
public class test {

    /** File written when no path is supplied on the command line. */
    private static final String DEFAULT_PATH = "test.txt";

    /**
     * Writes three CRLF-terminated sample lines to the target file as UTF-8.
     * I/O failures are reported on {@code System.err}.
     *
     * @param args optional; {@code args[0]} is the path of the file to write
     */
    public static void main(String[] args) {
        // Do not write the path as "\test.txt": inside a Java string literal "\t" is the
        // TAB escape, so that literal names a file called <TAB>est.txt.
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PATH;
        File fileDir = new File(path);

        // try-with-resources flushes and closes the writer even if an append fails.
        try (Writer out = new BufferedWriter(
                new OutputStreamWriter(
                        new FileOutputStream(fileDir),
                        java.nio.charset.StandardCharsets.UTF_8))) {
            out.append("Website UTF-8").append("\r\n");
            // Non-ASCII samples are spelled as Unicode escapes so they survive any
            // source-file encoding; literal characters here are easily corrupted to '?'.
            out.append("\u4e2d\u6587 UTF-8").append("\r\n");                               // Chinese
            out.append("\u0420\u0443\u0441\u0441\u043a\u0438\u0439 UTF-8").append("\r\n"); // Russian
        } catch (IOException e) {
            System.err.println("Could not write " + fileDir + ": " + e.getMessage());
        }
    }
}
Convert from String to byte[]:
// Encode the text as UTF-8 bytes. Passing a Charset constant instead of the name "UTF-8"
// avoids the checked UnsupportedEncodingException declared by the charset-name overload.
String s = "some text here";
byte[] b = s.getBytes(java.nio.charset.StandardCharsets.UTF_8);
Convert from byte[] to String:
// 99, 97 and 116 are the ASCII codes of 'c', 'a' and 't', so s becomes "cat".
// Passing a Charset constant instead of the name "US-ASCII" avoids the checked
// UnsupportedEncodingException declared by the charset-name constructor.
byte[] b = {(byte) 99, (byte) 97, (byte) 116};
String s = new String(b, java.nio.charset.StandardCharsets.US_ASCII);