大五碼(Big5)編碼自學筆記 - v3.14,楊和榮
Warning: GB2312 encoding used!
從 Unicode 到 Big5 轉換表製作程式
本章介紹了 UnicodeBig5.java 源程式。它可以用來製作 Unicode 到 Big5 編碼轉換表。
本書列出的從 Unicode 編碼到 Big5 編碼轉換表由下面的程式所產生。這個程式採用了 Java 內部編碼轉換函數 CharsetEncoder.encode() 和 CharsetDecoder.decode()。
/* UnicodeBig5.java
- Copyright (c) 2015, HerongYang.com, All Rights Reserved.
*/
import java.io.*;
import java.nio.*;
import java.nio.charset.*;
import java.util.*;
/**
 * Generates a Unicode-to-Big5 conversion table.
 *
 * <p>Every UTF-16 code unit from 0x0000 to 0xFFFF is run through the JDK's
 * "Big5" {@link CharsetEncoder}. Each code unit that encodes to a two-byte
 * sequence inside one of the three Big5 blocks described by
 * {@link #blk_first}/{@link #blk_last} is written to the output file as
 * "Unicode hex, Big5 hex, raw Big5 bytes". While doing so the program keeps
 * a per-block tally and a set of Big5 codes that no Unicode code unit
 * produced, and prints both to standard output at the end.
 */
class UnicodeBig5 {
    /** Destination of all write* helpers; assigned by {@link #main}. */
    static OutputStream out = null;

    /** Upper-case hexadecimal digits, indexed by nibble value. */
    static char hexDigit[] = {'0', '1', '2', '3', '4', '5', '6', '7',
        '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};

    /** Display name of each Big5 block; parallel to the other blk_* arrays. */
    static String blk_name[] = {"Special Symbols",
        "Level 1 Characters", "Level 2 Characters"};
    /** First two-byte code of each block (lead byte in the high 8 bits). */
    static int blk_first[] = {0xA140, 0xA440, 0xC940};
    /** Last two-byte code of each block (lead byte in the high 8 bits). */
    static int blk_last[] = {0xA3BF, 0xC67E, 0xF9D5};
    /** Number of codes each block is expected to contain (used for reporting only). */
    static int blk_size[] = {408, 5401, 7652};
    /** Number of codes per block that were actually produced by the encoder. */
    static int blk_count[] = {0, 0, 0};

    /**
     * Big5 codes not yet produced by the encoder, keyed by lower-case hex
     * (lead byte followed by trail byte, e.g. "a140"). The value is unused.
     */
    static Hashtable<String, Integer> code_map
        = new Hashtable<String, Integer>();

    /** Number of table entries written per output line. */
    private static final int ENTRIES_PER_LINE = 5;
    /** Number of table entries written before a new {@code <pre>} section is started. */
    private static final int ENTRIES_PER_SECTION = 250;

    /**
     * Builds the set of expected Big5 codes, then writes the conversion table
     * to the file "unicode_big5.big5" in the current directory.
     *
     * @param a command-line arguments (unused)
     */
    public static void main(String[] a) {
        setupMap();
        // try-with-resources closes (and flushes) the file even when writeCode()
        // fails; buffering avoids one system call per byte written.
        try (OutputStream file = new BufferedOutputStream(
                new FileOutputStream("unicode_big5.big5"))) {
            out = file;
            writeCode();
        } catch (IOException e) {
            System.out.println(e.toString());
        }
    }

    /**
     * Writes the whole conversion table to {@link #out} and prints the
     * per-block statistics and the never-produced Big5 codes to standard output.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeCode() throws IOException {
        CharsetEncoder b5ec = Charset.forName("Big5").newEncoder();
        char[] ca = new char[1];
        writeHeader();
        int count = 0;
        for (int i = 0; i < 0x010000; i++) {
            ca[0] = (char) i;
            ByteBuffer b5bb;
            try {
                b5bb = b5ec.encode(CharBuffer.wrap(ca));
            } catch (CharacterCodingException ignored) {
                // The encoder rejected this code unit: treat it as "no Big5 code".
                b5bb = null;
            }
            // NOTE: validBig5Bytes also updates blk_count and code_map.
            if (!validBig5Bytes(b5bb)) {
                continue;
            }
            count++;
            // Unicode code unit as four hex digits.
            writeHex((byte) (ca[0] >>> 8));
            writeHex((byte) (ca[0] & 0xff));
            writeString(" ");
            // Big5 code as hex digits.
            writeByteBuffer(b5bb, 2);
            writeString(" ");
            // The raw Big5 bytes, so that the character itself shows in the file.
            writeByte(b5bb.get(0));
            writeByte(b5bb.get(1));
            if (count % ENTRIES_PER_LINE == 0) {
                writeln();
            } else {
                writeString(" ");
            }
            if (count % ENTRIES_PER_SECTION == 0) {
                writeFooter();
                writeHeader();
            }
        }
        // Terminate a partially filled last line before closing the section.
        if (count % ENTRIES_PER_LINE != 0) {
            writeln();
        }
        writeFooter();

        for (int l = 0; l < blk_name.length; l++) {
            System.out.println(blk_name[l] + ": "
                + blk_count[l] + " of " + blk_size[l]);
        }
        System.out.println("Remaining Big5 codes:");
        for (String code : code_map.keySet()) {
            System.out.println(" " + code);
        }
    }

    /**
     * Fills {@link #code_map} with every two-byte code accepted by
     * {@link #validBig5(int, int, int)} for lead bytes 0xA1 through 0xFF.
     */
    public static void setupMap() {
        for (int i = 0xA1; i <= 0xFF; i++) {
            int blk = getBlock(i);
            if (blk == -1) {
                continue;
            }
            for (int j = 0x40; j <= 0xFF; j++) {
                if (validBig5(i, j, blk)) {
                    updateMap(i, j, "insert");
                }
            }
        }
    }

    /**
     * Inserts a Big5 code into, or removes it from, {@link #code_map}.
     *
     * @param i      lead byte value
     * @param j      trail byte value
     * @param action "insert" to add the code, "remove" to delete it; any
     *               other value (including {@code null}) is ignored
     */
    public static void updateMap(int i, int j, String action) {
        String code = Integer.toHexString(i) + Integer.toHexString(j);
        // Compare by content: reference comparison (==) only matches interned
        // literals and would silently ignore an equal string built at run time.
        if ("insert".equals(action)) {
            code_map.put(code, 1);
        } else if ("remove".equals(action)) {
            code_map.remove(code);
        }
    }

    /**
     * Finds the block whose lead-byte range contains the given lead byte.
     *
     * @param i lead byte value
     * @return index into the blk_* arrays, or -1 if no block matches
     */
    public static int getBlock(int i) {
        for (int l = 0; l < blk_first.length; l++) {
            int first = blk_first[l] >> 8;
            int last = blk_last[l] >> 8;
            if (i >= first && i <= last) {
                return l;
            }
        }
        return -1;
    }

    /**
     * Checks the trail byte of a two-byte code against the block's limits.
     * The lead byte is assumed to already belong to block {@code blk}.
     *
     * @param i   lead byte value
     * @param j   trail byte value
     * @param blk block index as returned by {@link #getBlock(int)}
     * @return true if {@code j} is a valid trail byte and the code does not
     *         lie beyond the last code of the block
     */
    public static boolean validBig5(int i, int j, int blk) {
        // valid ranges for j: 0x40 - 0x7E and 0xA1 - 0xFE.
        if (j < 0x40) {
            return false;
        }
        if (j > 0x7E && j < 0xA1) {
            return false;
        }
        if (j > 0xFE) {
            return false;
        }
        // The last row of a block may end before the trail byte range does.
        int last_i = blk_last[blk] >> 8;
        int last_j = blk_last[blk] & 0xFF;
        if (i == last_i && j > last_j) {
            return false;
        }
        return true;
    }

    /**
     * Checks whether an encoder result is a two-byte code inside one of the
     * known blocks.
     *
     * <p>Side effects: on success the matching entry of {@link #blk_count} is
     * incremented and the code is removed from {@link #code_map}.
     *
     * @param b5bb encoder output, or {@code null} if encoding failed
     * @return true if the buffer holds exactly two bytes forming a valid code
     */
    public static boolean validBig5Bytes(ByteBuffer b5bb) {
        if (b5bb == null || b5bb.limit() != 2) {
            return false;
        }
        // Mask to get unsigned byte values.
        int i = b5bb.get(0) & 0xFF;
        int j = b5bb.get(1) & 0xFF;
        int blk = getBlock(i);
        if (blk == -1 || !validBig5(i, j, blk)) {
            return false;
        }
        blk_count[blk] = blk_count[blk] + 1;
        updateMap(i, j, "remove");
        return true;
    }

    /**
     * Opens a {@code <pre>} section and writes the five-column caption line
     * followed by an empty line.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeHeader() throws IOException {
        writeString("<pre class=\"chinese\">");
        writeln();
        for (int col = 0; col < ENTRIES_PER_LINE; col++) {
            if (col > 0) {
                writeString(" ");
            }
            writeString("Uni. Big5 ");
            writeBig5Space();
        }
        writeln();
        writeln();
    }

    /**
     * Closes the current {@code <pre>} section and writes an empty line.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeFooter() throws IOException {
        writeString("</pre>");
        writeln();
        writeln();
    }

    /**
     * Writes a CR LF line terminator.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeln() throws IOException {
        out.write(0x0D);
        out.write(0x0A);
    }

    /**
     * Writes the two bytes 0xA1 0x40, the first code of the
     * "Special Symbols" block.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeBig5Space() throws IOException {
        out.write(0xA1);
        out.write(0x40);
    }

    /**
     * Writes the bytes of a buffer as hex digits, padding with one
     * {@code " "} string per byte the buffer is short of {@code l}.
     * A {@code null} buffer is written as the text "null" and counted as
     * two bytes for padding purposes.
     *
     * @param b buffer to print, may be {@code null}
     * @param l number of bytes the field is padded to
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeByteBuffer(ByteBuffer b, int l)
            throws IOException {
        int written;
        if (b == null) {
            writeString("null");
            written = 2;
        } else {
            written = b.limit();
            for (int k = 0; k < written; k++) {
                writeHex(b.get(k));
            }
        }
        for (int k = written; k < l; k++) {
            writeString(" ");
        }
    }

    /**
     * Writes the low 8 bits of every character of a string; a {@code null}
     * string writes nothing.
     *
     * @param s text to write, may be {@code null}
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeString(String s) throws IOException {
        if (s == null) {
            return;
        }
        for (int i = 0; i < s.length(); i++) {
            out.write((int) (s.charAt(i) & 0xFF));
        }
    }

    /**
     * Writes a byte as two upper-case hex digits.
     *
     * @param b byte to print
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeHex(byte b) throws IOException {
        out.write((int) hexDigit[(b >> 4) & 0x0F]);
        out.write((int) hexDigit[b & 0x0F]);
    }

    /**
     * Writes a single raw byte.
     *
     * @param b byte to write
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeByte(byte b) throws IOException {
        out.write(b & 0xFF);
    }
}
關於程式的幾點註解:
程式可以在 JDK 8 到 JDK 20 的任何一個版本中編譯和執行。 程式的輸出結果如下:
herong$ javac UnicodeBig5.java
herong$ java UnicodeBig5
Special Symbols: 401 of 408
Level 1 Characters: 5401 of 5401
Level 2 Characters: 7652 of 7652
Remaining Big5 codes:
 a2ce
 a2cc
 a1fe
 a1c5
 a1c3
 a240
 a15a
結果顯示,JDK 無法轉換出七個大五碼符號:
A2CC: 大五碼重複編碼 (十 A2CC U+5341, 十 A451 U+5341)
A2CE: 大五碼重複編碼 (卅 A2CE U+5345, 卅 A4CA U+5345)
A1FE: Java 錯誤 - 轉換有誤 (/ A1FE U+2571, ╱ A2AC U+2571) It should be: / A1FE U+FF0F
A240: Java 錯誤 - 轉換有誤 (\ A240 U+2572, ╲ A2AD U+2572) It should be: \ A240 U+FF3C
A15A: Java 錯誤 - 轉換有誤 (_ A15A U+FF3F, _ A1C4 U+FF3F) It should be: _ A15A U+2574
A1C3: Java 錯誤 - 轉換空缺 (￣ A1C3 U+FFE3)
A1C5: Java 錯誤 - 轉換空缺 (ˍ A1C5 U+02CD)
程式輸出的轉換表將列入本書的後面部分。
Table of Contents