從大五碼到 Unicode 轉換表製作程式

本章介紹了 Big5Unicode.java 源程式。它可以用來製作大五碼 (Big5) 到 Unicode 編碼轉換表。

本書列出的從大五碼到 Unicode 編碼轉換表由下面的程式所產生。這個程式採用了 Java 內部編碼轉換函數 CharsetDecoder.decode() 和 CharsetEncoder.encode():

/* Big5Unicode.java
 - Copyright (c) 2015, HerongYang.com, All Rights Reserved.
 */
import java.io.*;
import java.nio.*;
import java.nio.charset.*;

/**
 * Generates a Big5-to-Unicode conversion table.
 *
 * <p>For every Big5 lead byte (row) that belongs to one of the three blocks
 * described by {@code blk_first}/{@code blk_last}, the program decodes each
 * candidate two-byte Big5 code with the JDK "Big5" charset, re-encodes the
 * result as UTF-16BE, and writes one table cell per code to {@link #out}.
 * The table is emitted as raw bytes (Big5 characters mixed with ASCII text
 * and a little HTML markup), four cells per line.
 *
 * <p>All state is static; the class is not designed for concurrent use.
 */
class Big5Unicode {
  /** Destination of every {@code write*} helper; must be set before use. */
  static OutputStream out = null;

  /** Upper-case hexadecimal digits, indexed by nibble value. */
  static char hexDigit[] = {'0', '1', '2', '3', '4', '5', '6', '7',
                           '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};

  /** Display name of each block; parallel to the other {@code blk_*} arrays. */
  static String blk_name[] = {"Special Symbols",
       "Level 1 Characters", "Level 2 Characters"};
  /** First Big5 code (lead byte in the high 8 bits) of each block. */
  static int blk_first[] = {0xA140, 0xA440, 0xC940};
  /** Last Big5 code (lead byte in the high 8 bits) of each block. */
  static int blk_last[] = {0xA3BF, 0xC67E, 0xF9D5};
  /** Expected number of characters per block; only used in the summary. */
  static int blk_size[] = {408, 5401, 7652};
  /** Number of codes successfully converted per block by the last run. */
  static int blk_count[] = {0, 0, 0};

  /**
   * Writes the conversion table to the file {@code big5-unicode.big5} in the
   * current directory. Any {@link IOException} is reported on standard output.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    // try-with-resources guarantees the file is flushed and closed even when
    // writeCode() fails part-way; buffering avoids one system call per byte.
    try (OutputStream file = new BufferedOutputStream(
             new FileOutputStream("big5-unicode.big5"))) {
      out = file;
      writeCode();
    } catch (IOException e) {
      System.out.println(e.toString());
    }
  }

  /**
   * Writes the complete table to {@link #out} and prints a per-block summary
   * ("name: converted of expected") to standard output.
   *
   * <p>{@code blk_count} is reset first, so the summary always reflects this
   * call only, even if the method is invoked more than once.
   *
   * @throws IOException if writing to {@link #out} fails
   */
  public static void writeCode() throws IOException {
    CharsetDecoder b5dc = Charset.forName("Big5").newDecoder();
    CharsetEncoder uxec = Charset.forName("UTF-16BE").newEncoder();

    for (int l=0; l<blk_count.length; l++) {
      blk_count[l] = 0;
    }

    for (int i=0xA1; i<=0xFF; i++) {
      int blk = getBlock(i);
      if (blk==-1) continue;  // lead byte outside every block: no row

      writeln();
      writeString("<p><b>Row ");
      writeHex((byte)i);
      writeString(": "+blk_name[blk]+"</b></p>");
      writeln();

      writeln();
      writeHeader();
      for (int j=0x40; j<=0xFF; j++) {
        byte hi = (byte)(i);
        byte lo = (byte)(j);
        ByteBuffer uxbb = null;  // stays null unless the code converts

        if (validBig5(i, j, blk)) {
          try {
            // decode(ByteBuffer)/encode(CharBuffer) are the one-shot
            // convenience methods, so the coders can be reused per cell.
            CharBuffer cb = b5dc.decode(ByteBuffer.wrap(new byte[]{hi, lo}));
            uxbb = uxec.encode(cb);
            // Cell layout: raw Big5 character, space, its code in hex.
            writeByte(hi);
            writeByte(lo);
            writeString(" ");
            writeHex(hi);
            writeHex(lo);
            blk_count[blk] = blk_count[blk] + 1;
          } catch (CharacterCodingException e) {
            // The JDK cannot convert this code: emit a placeholder cell.
            uxbb = null;
            writeBig5Space();
            writeString(" fail");
          }
        } else {
          // Not a code point of this block: emit a placeholder cell.
          writeBig5Space();
          writeString(" null");
        }

        writeString(" ");
        writeByteBuffer(uxbb, 2);

        // Four cells per output line.
        if ((j+1)%4 == 0) {
          writeln();
        } else {
          writeString("   ");
        }
      }
      writeFooter();
    }

    for (int l=0; l<blk_name.length; l++) {
      System.out.println(blk_name[l]+": "
        + blk_count[l]+" of "+blk_size[l]);
    }
  }

  /**
   * Writes a CR LF line terminator.
   *
   * @throws IOException if writing to {@link #out} fails
   */
  public static void writeln() throws IOException {
    out.write(0x0D);
    out.write(0x0A);
  }

  /**
   * Writes a single raw byte.
   *
   * @param b the byte to write
   * @throws IOException if writing to {@link #out} fails
   */
  public static void writeByte(byte b) throws IOException {
    out.write(b & 0xFF);
  }

  /**
   * Writes the bytes of a buffer as hexadecimal digits, padded to a fixed
   * number of bytes.
   *
   * <p>A {@code null} buffer is written as the text {@code "null"} and is
   * counted as two bytes for padding purposes. Otherwise bytes 0 to
   * {@code b.limit()-1} are written as two hex digits each. In both cases two
   * spaces are appended for every byte still missing to reach {@code l}.
   *
   * @param b the buffer to dump, or {@code null}
   * @param l the minimum width of the field, in bytes
   * @throws IOException if writing to {@link #out} fails
   */
  public static void writeByteBuffer(ByteBuffer b, int l)
    throws IOException {
    int i = 0;
    if (b==null) {
      writeString("null");
      i = 2;  // "null" occupies the same 4 columns as two hex bytes
    } else {
      for (i=0; i<b.limit(); i++) writeHex(b.get(i));
    }
    for (int j=i; j<l; j++) writeString("  ");
  }

  /**
   * Writes the two bytes 0xA1 0x40, used as the character column of
   * placeholder cells and header cells.
   *
   * @throws IOException if writing to {@link #out} fails
   */
  public static void writeBig5Space() throws IOException {
    out.write(0xA1);
    out.write(0x40);
  }

  /**
   * Writes a string as one byte per character. Only the low 8 bits of each
   * character are written, so the text should be limited to characters below
   * U+0100. A {@code null} string writes nothing.
   *
   * @param s the text to write, or {@code null}
   * @throws IOException if writing to {@link #out} fails
   */
  public static void writeString(String s) throws IOException {
    if (s!=null) {
      for (int i=0; i<s.length(); i++) {
        out.write((int) (s.charAt(i) & 0xFF));
      }
    }
  }

  /**
   * Writes the last two decimal digits of a number, zero-padded on the left
   * (7 becomes "07", 123 becomes "23").
   *
   * @param i the number to write
   * @throws IOException if writing to {@link #out} fails
   */
  public static void writeNumber(int i) throws IOException {
    String s = "00" + String.valueOf(i);
    writeString(s.substring(s.length()-2,s.length()));
  }

  /**
   * Writes a byte as two upper-case hexadecimal digits.
   *
   * @param b the byte to write
   * @throws IOException if writing to {@link #out} fails
   */
  public static void writeHex(byte b) throws IOException {
    // The mask discards the sign-extension bits produced by >> on a byte.
    out.write((int) hexDigit[(b >> 4) & 0x0F]);
    out.write((int) hexDigit[b & 0x0F]);
  }

  /**
   * Opens a row's {@code <pre>} section and writes the four column headings,
   * followed by an empty line.
   *
   * @throws IOException if writing to {@link #out} fails
   */
  public static void writeHeader() throws IOException {
    writeString("<pre class=\"chinese\">");
    for (int k=0; k<4; k++) {
      if (k>0) writeString("   ");  // same gap as between table cells
      writeBig5Space();
      writeString(" Big5 Uni.");
    }
    writeln();
    writeln();
  }

  /**
   * Closes the {@code <pre>} section opened by {@link #writeHeader()}.
   *
   * @throws IOException if writing to {@link #out} fails
   */
  public static void writeFooter() throws IOException {
    writeString("</pre>");
    writeln();
  }

  /**
   * Tells whether a lead/trail byte pair is a code point of the given block.
   *
   * <p>The trail byte must lie in 0x40-0x7E or 0xA1-0xFE, and on the block's
   * last row it must not exceed the trail byte of {@code blk_last[blk]}.
   * The lead byte is not checked against the block's row range.
   *
   * @param i   the lead byte (0-255)
   * @param j   the trail byte (0-255)
   * @param blk index of the block, as returned by {@link #getBlock(int)}
   * @return {@code true} if the pair should be converted
   */
  public static boolean validBig5(int i, int j, int blk) {
    // valid ranges for j: 0x40 - 0x7E and 0xA1 - 0xFE.
    if (j<0x40) return false;
    if (j>0x7E && j<0xA1) return false;
    if (j>0xFE) return false;

    int last_i = blk_last[blk] >> 8;
    int last_j = blk_last[blk] & 0xFF;
    if (i==last_i && j>last_j) return false;

    return true;
  }

  /**
   * Finds the block whose row range contains the given lead byte.
   *
   * @param i the lead byte (0-255)
   * @return the index into the {@code blk_*} arrays, or -1 if the lead byte
   *         belongs to no block
   */
  public static int getBlock(int i) {
    for (int l=0; l<blk_first.length; l++) {
      int first = blk_first[l] >> 8;
      int last = blk_last[l] >> 8;
      if (i>=first && i<=last) return l;
    }
    return -1;
  }
}

關於程式的幾點註解:

程式可以在 JDK 8 到 JDK 20 的任何一個版本中編譯和執行。 程式的輸出結果如下:

herong$ javac Big5Unicode.java
herong$ java Big5Unicode

Special Symbols: 406 of 408
Level 1 Characters: 5401 of 5401
Level 2 Characters: 7652 of 7652

結果顯示,JDK 無法處理兩個大五碼符號: 0xA1C3 (￣) 和 0xA1C5 (ˍ)。 我們只好在輸出的轉換表中,做下列修改:

￣ A1C3 U+FFE3
ˍ A1C5 U+02CD

另外,JDK 還有三個解碼錯誤,需要修改:

A1FE: Java bug - wrong mapping ( A1FE U+2571,  A2AC U+2571)
   It should be:  A1FE U+FF0F
A240: Java bug - wrong mapping ( A240 U+2572,  A2AD U+2572)
   It should be:  A240 U+FF3C
A15A: Java bug - wrong mapping ( A15A U+FF3F, _ A1C4 U+FF3F)
  It should be:  A15A U+2574

程式輸出的轉換表將列入本書的後面部分。

Table of Contents

 說明與摘要

 大五碼(Big5)字元集和編碼說明

從大五碼到 Unicode 轉換表製作程式

 Big5 到 Unicode 轉換 - 特殊符號

 Big5 到 Unicode 轉換 - 一級漢字

 Big5 到 Unicode 轉換 - 二級漢字

 從 Unicode 到 Big5 轉換表製作程式

 Unicode 到 Big5 轉換 - 13,461 全部字元

 參考文獻

 PDF,EPUB,以及印刷版全版