一个类似于 Native2Ascii 的小工具

同事晚上问到,我花了点时间赶制的,不知道有什么问题,请大家在不同的环境下测试一下,有问题请回帖,谢谢。


package com.utstar.pizer.util.unicode;

import java.util.*;
import java.io.*;

/**
 * Run-time counterpart of the JDK <code>native2ascii</code> tool.
 *
 * <p>{@link #escapeUnicode(String)} rewrites every non-ASCII character of a
 * string as a six-character escape (a backslash, the letter <code>u</code> and
 * four hexadecimal digits, e.g. <code>&#92;u4f60</code>), and
 * {@link #unescapeUnicode(String, String)} turns such escapes back into
 * characters. Both methods are stateless and work purely on in-memory strings.</p>
 *
 * <p>Copyright: Copyright (c) 2003</p>
 *
 * @author not attributable
 * @version 1.0
 */
public class UnicodeUtil {

    /** Marker that introduces an escape: one backslash followed by the letter u. */
    private static final String ESCAPE_PREFIX = "\\u";

    /** Number of hexadecimal digits that must follow {@link #ESCAPE_PREFIX}. */
    private static final int HEX_DIGITS = 4;

    /** Largest code unit that is written unescaped (end of the ASCII range). */
    private static final char MAX_ASCII = 0x7f;

    /** Scratch file used by the {@link #main(String[])} demo. */
    private static final String DEMO_FILE = "data.txt";

    /** Charset used by the {@link #main(String[])} demo when writing and reading the scratch file. */
    private static final String DEMO_CHARSET = "UTF-8";

    /**
     * Small manual demo: escapes a sample string, writes the escaped text to
     * <code>data.txt</code> in the working directory, reads it back, unescapes
     * it and prints the intermediate results plus the elapsed time of the
     * read-and-unescape step.
     *
     * @param args ignored
     * @throws Exception if the scratch file cannot be written or read
     */
    public static void main(String[] args) throws Exception {
        // The sample characters are spelled as Unicode escapes so that the
        // source compiles identically whatever encoding javac assumes.
        System.out.println(Integer.toHexString((int) '\u4F60'));
        System.out.println(Integer.toHexString((int) '\u6211'));

        String tmp =
            "\\u" + "4F60" + "\\u" + "6211\u0000\u540c\\u\u65f6a\\bc\u5e87DEf_)*&^\\u^";
        System.out.println(tmp);

        String tmp2 = escapeUnicode(tmp);
        System.out.println("[" + tmp2 + "]");

        // Write the escaped text; the finally block guarantees the file
        // handle is released even when writing fails.
        FileOutputStream fileOut = new FileOutputStream(DEMO_FILE);
        try {
            Writer writer = new OutputStreamWriter(fileOut, DEMO_CHARSET);
            writer.write(tmp2);
            writer.flush();
        } finally {
            fileOut.close();
        }

        long start = System.currentTimeMillis();

        StringBuffer content = new StringBuffer();
        FileInputStream fileIn = new FileInputStream(DEMO_FILE);
        try {
            Reader reader = new InputStreamReader(fileIn, DEMO_CHARSET);
            char[] buffer = new char[1024 * 4];
            int read;
            while ((read = reader.read(buffer, 0, buffer.length)) > 0) {
                content.append(buffer, 0, read);
            }
        } finally {
            fileIn.close();
        }

        System.out.println(unescapeUnicode(content.toString(), null));
        long end = System.currentTimeMillis();
        System.out.println("Time consumed:" + (end - start));
    }

    /**
     * Decodes 4-hex-digit Unicode escapes in a string, in the spirit of
     * section 3.3 ("Unicode Escapes") of the Java Language Specification.
     *
     * <p>Scanning rules:</p>
     * <ol>
     * <li>Look for the next backslash-u marker.</li>
     * <li>If the marker is immediately preceded by another backslash, it is an
     *     escaped marker: the second backslash is dropped and the text is kept
     *     literally (two backslashes plus <code>u</code> become one backslash
     *     plus <code>u</code>).</li>
     * <li>If the marker is followed by exactly four hexadecimal digits, the six
     *     characters are replaced by the UTF-16 code unit they denote. As a
     *     short example, <code>&#92;u00BF</code> becomes &iquest;.</li>
     * <li>Otherwise (too few characters left, a sign character, or any other
     *     non-hex character) the marker is copied through unchanged and
     *     scanning resumes right after it.</li>
     * </ol>
     *
     * <p>The result is returned as-is. Earlier revisions re-encoded the result
     * through <code>encoding</code> and decoded it again with the platform
     * charset, which destroyed every character that charset pair could not
     * round-trip; the decoded value is already a Unicode string and needs no
     * transcoding.</p>
     *
     * @param s the string to decode; may be <code>null</code>
     * @param encoding ignored; retained only so existing callers keep compiling
     * @return the decoded string, <code>s</code> itself when it contains no
     *         marker, or <code>null</code> when <code>s</code> is <code>null</code>
     * @throws UnsupportedEncodingException never thrown by this implementation;
     *         declared for source compatibility with existing callers
     */
    public static String unescapeUnicode(String s, String encoding) throws
        UnsupportedEncodingException {
        if (s == null) {
            return null;
        }

        int escapeAt = s.indexOf(ESCAPE_PREFIX);
        if (escapeAt < 0) {
            // Nothing to decode.
            return s;
        }

        StringBuffer out = new StringBuffer(s.length());
        int copiedTo = 0; // everything before this index is already in 'out'

        while (escapeAt >= 0) {
            out.append(s.substring(copiedTo, escapeAt));

            if (escapeAt > 0 && s.charAt(escapeAt - 1) == '\\') {
                // Escaped marker: the preceding backslash was copied above,
                // so drop this one and keep only the letter.
                out.append('u');
                copiedTo = escapeAt + ESCAPE_PREFIX.length();
            } else {
                int codeUnit = parseHexDigits(s, escapeAt + ESCAPE_PREFIX.length());
                if (codeUnit < 0) {
                    // Malformed or truncated escape: keep the marker literally.
                    out.append(ESCAPE_PREFIX);
                    copiedTo = escapeAt + ESCAPE_PREFIX.length();
                } else {
                    out.append((char) codeUnit);
                    copiedTo = escapeAt + ESCAPE_PREFIX.length() + HEX_DIGITS;
                }
            }

            escapeAt = s.indexOf(ESCAPE_PREFIX, copiedTo);
        }

        // Tail after the last marker.
        out.append(s.substring(copiedTo));
        return out.toString();
    }

    /**
     * Encodes every character outside the ASCII range as a backslash-u escape
     * with exactly four lowercase hexadecimal digits (zero-padded), e.g.
     * U+0100 becomes <code>&#92;u0100</code>. ASCII characters, including
     * backslashes, are copied unchanged.
     *
     * @param s the string to encode; may be <code>null</code>
     * @return the ASCII-only encoded string, or <code>null</code> when
     *         <code>s</code> is <code>null</code>
     */
    public static String escapeUnicode(String s) {
        if (s == null) {
            return null;
        }

        StringBuffer sb = new StringBuffer(s.length());
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c > MAX_ASCII) {
                String hex = Integer.toHexString(c);
                sb.append(ESCAPE_PREFIX);
                // Pad to a fixed width so the decoder can always read
                // exactly HEX_DIGITS digits back.
                for (int pad = hex.length(); pad < HEX_DIGITS; pad++) {
                    sb.append('0');
                }
                sb.append(hex);
            } else {
                sb.append(c);
            }
        }
        return sb.toString();
    }

    /**
     * Reads {@link #HEX_DIGITS} hexadecimal digits of <code>s</code> starting
     * at <code>from</code>.
     *
     * @return the parsed value (0..0xFFFF), or -1 when fewer than
     *         {@link #HEX_DIGITS} characters remain or any of them is not a
     *         hexadecimal digit
     */
    private static int parseHexDigits(String s, int from) {
        if (from + HEX_DIGITS > s.length()) {
            return -1;
        }
        int value = 0;
        for (int i = from; i < from + HEX_DIGITS; i++) {
            int digit = Character.digit(s.charAt(i), 16);
            if (digit < 0) {
                return -1;
            }
            value = (value << 4) | digit;
        }
        return value;
    }
}

我想了一下,那个转码的操作是没有必要的,因为从 \u 这样的字串转回来的是 Unicode 字符,所以没有必要做转码。

Native2ascii 之所以要提供 encoding 信息,是因为它要输出到文件里,只有指定了正确的编码,它才能正确输出。

与Native2Ascii 有什么区别和特点?

呵呵~~

假设有这样的应用,有人将一段文本中非 ascii 文本以 \u 的形式存在数据库中,当他需要从 \u 字串转回相应字符时,用 Native2Ascii 该如何操作?

使用 Native2Ascii 似乎不能满足 run-time 的要求吧~~