一个类似于 Native2Ascii 的东东

03-09-03 iceant
同事晚上问到,我花了点时间赶制的,不知道有什么问题,请大家在不同的环境下测试一下,有问题请回贴,谢谢。

package com.utstar.pizer.util.unicode;

import java.util.*;
import java.io.*;

/**
 * Run-time helpers that convert between text and its 4-hex-digit Unicode
 * escape form (a backslash, the letter 'u' and four hexadecimal digits), in
 * the spirit of the JDK <code>native2ascii</code> tool but operating on
 * in-memory strings instead of files.
 *
 * @version 1.1
 */

public class UnicodeUtil {

  /** The two characters that introduce an escape: a backslash followed by 'u'. */
  private static final String ESCAPE_PREFIX = "\\u";

  /** Number of hexadecimal digits that follow the prefix in a complete escape. */
  private static final int HEX_DIGITS = 4;

  /** Total length of a complete escape: prefix plus hex digits. */
  private static final int ESCAPE_LENGTH = ESCAPE_PREFIX.length() + HEX_DIGITS;

  /**
   * Small manual demo: escapes a sample string, writes it to
   * <code>data.txt</code> as UTF-8, reads it back, unescapes it and prints the
   * elapsed time of the read/unescape phase.
   *
   * @param args ignored
   * @throws Exception if the demo file cannot be written or read
   */
  public static void main(String[] args) throws Exception {
    // U+4F60 and U+6211 are written as escapes so that the result does not
    // depend on the encoding the compiler assumes for this source file.
    System.out.println(Integer.toHexString( (int) '\u4F60'));
    System.out.println(Integer.toHexString( (int) '\u6211'));

    String tmp = "\\u" + "4F60" + "\\u" + "6211\u0000\u540c\\u\u65f6a\\bc\u5e87DEf_)*&^\\u^";
    System.out.println(tmp);

    String escaped = escapeUnicode(tmp);
    System.out.println("[" + escaped + "]");

    // close() flushes; the finally block guarantees the file handle is released.
    Writer writer = new OutputStreamWriter(new FileOutputStream("data.txt"), "UTF-8");
    try {
      writer.write(escaped);
    }
    finally {
      writer.close();
    }

    long start = System.currentTimeMillis();
    StringBuffer content = new StringBuffer();
    Reader reader = new InputStreamReader(new FileInputStream("data.txt"), "UTF-8");
    try {
      char[] buffer = new char[1024 * 4];
      int count;
      while ( (count = reader.read(buffer, 0, buffer.length)) > 0) {
        content.append(buffer, 0, count);
      }
    }
    finally {
      reader.close();
    }
    System.out.println(unescapeUnicode(content.toString(), null));
    long end = System.currentTimeMillis();
    System.out.println("Time consumed:" + (end - start));
  }

  /*
     Decoding algorithm used by unescapeUnicode.

     A backslash is not a general escape character here; only the pair
     backslash-u is special, and a backslash-u that is itself preceded by a
     backslash stands for a literal backslash-u.

     1. Scan for the next backslash-u and copy everything before it.
     2. If the character just before it is a backslash, emit 'u' (the
        preceding backslash has already been copied), skip the pair and go
        to 1.
     3. If fewer than four characters follow, copy the rest of the input
        unchanged and stop.
     4. If the next four characters are not all ASCII hex digits, copy the
        pair unchanged and go to 1.
     5. Otherwise emit the character with that code and go to 1.
   */

  /**
   * Decode 4-hex-digit unicode escapes from a String. The escape syntax is
   * modelled on &sect;3.3 of the Java Language Specification.
   *
   * As a short example: "&#92;u00BF" would be translated into &iquest; (the inverted question mark)
   *
   * <p>Rules applied while scanning:</p>
   * <ul>
   * <li>an escape whose prefix is directly preceded by a backslash is not
   *     decoded; the two backslashes collapse into one and the following
   *     text is copied literally;</li>
   * <li>a prefix followed by fewer than four characters ends the scan and the
   *     remainder of the input is copied unchanged;</li>
   * <li>a prefix followed by four characters that are not all ASCII
   *     hexadecimal digits (signs such as '+' or '-' are rejected) is copied
   *     unchanged.</li>
   * </ul>
   *
   * <p>The decoded text is returned as is. Earlier versions round-tripped the
   * result through bytes of <code>encoding</code>, which replaced every
   * character the charset could not represent.</p>
   *
   * @param s the string to decode, may be <code>null</code>
   * @param encoding ignored; retained so existing callers keep compiling
   * @return the decoded string, <code>null</code> if <code>s</code> is
   *         <code>null</code>, or <code>s</code> itself when it contains no
   *         escape prefix
   * @throws UnsupportedEncodingException never thrown by this implementation;
   *         declared only for source compatibility with existing callers
   */
  public static String unescapeUnicode(String s, String encoding) throws
      UnsupportedEncodingException {
    if (s == null) {
      return s;
    }

    int escapeAt = s.indexOf(ESCAPE_PREFIX);
    if (escapeAt < 0) {
      return s; // nothing to decode
    }

    StringBuffer out = new StringBuffer(s.length());
    int copiedTo = 0; // index of the first input char not yet written to out

    while (escapeAt >= 0) {
      out.append(s.substring(copiedTo, escapeAt));
      int afterPrefix = escapeAt + ESCAPE_PREFIX.length();

      if (escapeAt > 0 && s.charAt(escapeAt - 1) == '\\') {
        // Escaped prefix: the preceding backslash was copied just above, so
        // only the 'u' is still missing; the second backslash is dropped.
        out.append('u');
        copiedTo = afterPrefix;
      }
      else if (escapeAt + ESCAPE_LENGTH > s.length()) {
        // Truncated escape at the end of the input: keep the tail verbatim.
        copiedTo = escapeAt;
        break;
      }
      else if (isHexQuad(s, afterPrefix)) {
        String hex4 = s.substring(afterPrefix, escapeAt + ESCAPE_LENGTH);
        out.append( (char) Integer.parseInt(hex4, 16));
        copiedTo = escapeAt + ESCAPE_LENGTH;
      }
      else {
        // Not a valid escape: keep the prefix and rescan right after it.
        out.append(ESCAPE_PREFIX);
        copiedTo = afterPrefix;
      }

      escapeAt = s.indexOf(ESCAPE_PREFIX, copiedTo);
    }

    out.append(s.substring(copiedTo));
    return out.toString();
  }

  /**
   * Encode every character above U+00FF as a 4-hex-digit unicode escape
   * (lower-case digits, zero padded); all other characters are copied
   * unchanged.
   *
   * <p>A backslash-u sequence that is already present in the input is copied
   * unchanged as well, so {@link #unescapeUnicode(String, String)} will treat
   * it as an escape.</p>
   *
   * @param s the string to encode, may be <code>null</code>
   * @return the encoded string, or <code>null</code> if <code>s</code> is
   *         <code>null</code>
   */
  public static String escapeUnicode(String s) {
    if (s == null) {
      return s;
    }
    StringBuffer sb = new StringBuffer(s.length());
    for (int i = 0; i < s.length(); i++) {
      char c = s.charAt(i);
      if (c > 0xff) {
        appendEscape(sb, c);
      }
      else {
        sb.append(c);
      }
    }
    return sb.toString();
  }

  /** Appends the zero-padded 4-digit escape for <code>c</code> to <code>sb</code>. */
  private static void appendEscape(StringBuffer sb, char c) {
    String hex = Integer.toHexString(c);
    sb.append(ESCAPE_PREFIX);
    // toHexString drops leading zeros; an escape always needs four digits.
    for (int pad = hex.length(); pad < HEX_DIGITS; pad++) {
      sb.append('0');
    }
    sb.append(hex);
  }

  /**
   * Tells whether the four characters of <code>s</code> starting at
   * <code>from</code> are all ASCII hexadecimal digits. The caller guarantees
   * that those four characters exist.
   */
  private static boolean isHexQuad(String s, int from) {
    for (int i = from; i < from + HEX_DIGITS; i++) {
      char c = s.charAt(i);
      boolean hex = (c >= '0' && c <= '9')
          || (c >= 'a' && c <= 'f')
          || (c >= 'A' && c <= 'F');
      if (!hex) {
        return false;
      }
    }
    return true;
  }
}

<p>

iceant
2003-09-04 09:23
我想了一下,那个转码的操作是没有必要的:从 \u 这样的字串转回来的已经是 Unicode 字符,所以不需要再做转码。

Native2ascii 之所以要提供 encoding 信息,是因为它要输出到文件里,只有指定了正确的编码,它才能正确输出。

banq
2003-09-04 09:37
与Native2Ascii 有什么区别和特点?

iceant
2003-09-04 09:45
呵呵~~

假设有这样一种应用:有人将一段文本中的非 ASCII 字符以 \u 的形式存在数据库中,当他需要把 \u 字串转回相应字符时,用 Native2Ascii 该如何操作?

使用 Native2Ascii 似乎不能满足 run-time 的要求吧~~

猜你喜欢