package com.utstar.pizer.util.unicode;
import java.util.*;
import java.io.*;
/**
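* <p>Title: UnicodeUtil</p>
* <p>Description: Converts text to and from Java-style backslash-u escapes (four hex digits per character).</p>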
* <p>Copyright: Copyright (c) 2003</p>
* <p>Company: </p>
* @author not attributable
* @version 1.0
*/
public class UnicodeUtil {

  /**
   * Small demonstration: escapes a sample string, writes it to
   * <code>data.txt</code> (UTF-8) in the working directory, reads it back,
   * unescapes it and prints the result together with the elapsed time.
   *
   * @param args ignored
   * @throws Exception if the file cannot be written or read
   */
  public static void main(String[] args) throws Exception {
    // Non-ASCII characters are spelled as Unicode escapes so that this file
    // compiles identically whatever source encoding the compiler assumes.
    System.out.println(Integer.toHexString((int) '\u4F60'));
    System.out.println(Integer.toHexString((int) '\u6211'));
    String tmp = "\\u" + "4F60" + "\\u" + "6211\u0000\u540c\\u\u65f6a\\bc\u5e87DEf_)*&^\\u^";
    System.out.println(tmp);
    String escaped = escapeUnicode(tmp);
    System.out.println("[" + escaped + "]");

    OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream("data.txt"), "UTF-8");
    try {
      osw.write(escaped);
      osw.flush();
    }
    finally {
      // close even when the write fails, so the file handle is not leaked
      osw.close();
    }

    long start = System.currentTimeMillis();
    StringBuffer s = new StringBuffer();
    InputStreamReader isr = new InputStreamReader(new FileInputStream("data.txt"), "UTF-8");
    try {
      char[] inc = new char[1024 * 4];
      int p = 0;
      while ((p = isr.read(inc, 0, inc.length)) > 0) {
        s.append(inc, 0, p);
      }
    }
    finally {
      isr.close();
    }
    System.out.println(unescapeUnicode(s.toString(), null));
    long end = System.currentTimeMillis();
    System.out.println("Time consumed:" + (end - start));
  }

  /**
   * Decodes 4-hex-digit unicode escapes (<code>&#92;uXXXX</code>) in a String.
   * The escape syntax follows section 3.3 ("Unicode Escapes") of the Java
   * Language Specification, with these rules:
   * <ul>
   * <li>A backslash is not a general escape character; only the pair
   *     backslash + 'u' is recognised.</li>
   * <li>Backslash + 'u' immediately preceded by another backslash stands for
   *     a literal backslash + 'u': the second backslash is dropped and the
   *     following characters are not decoded.</li>
   * <li>Backslash + 'u' that is not followed by exactly four ASCII hex digits
   *     (including a sequence cut off by the end of the string) is copied to
   *     the output unchanged.</li>
   * </ul>
   * As a short example, <code>&#92;u00BF</code> is translated into U+00BF
   * (the inverted question mark).
   *
   * <p>After decoding, the text is encoded to bytes with <code>encoding</code>
   * and decoded again with the same charset, so characters that charset cannot
   * encode may come back altered.</p>
   *
   * @param s the string to decode; may be null
   * @param encoding name of the charset for the final round trip; when null or
   *        empty, the <code>file.encoding</code> system property is used, and
   *        <code>ISO-8859-1</code> if that property is not set
   * @return the decoded string, or null if <code>s</code> is null
   * @throws UnsupportedEncodingException if the named charset is not supported
   */
  public static String unescapeUnicode(String s, String encoding) throws
      UnsupportedEncodingException {
    if (s == null) {
      return s;
    }
    String decoded = decodeEscapes(s);
    if (encoding == null || "".equals(encoding)) {
      encoding = System.getProperty("file.encoding", "ISO-8859-1");
    }
    // Decode with the SAME charset that produced the bytes; decoding with the
    // platform default garbles the text whenever the two charsets differ.
    return new String(decoded.getBytes(encoding), encoding);
  }

  /** Replaces every well-formed backslash-u escape in s by the character it names. */
  private static String decodeEscapes(String s) {
    int escape = s.indexOf("\\u");
    if (escape < 0) {
      return s; // nothing to decode
    }
    StringBuffer out = new StringBuffer(s.length());
    int copied = 0; // everything before this index has been handled
    while (escape >= 0) {
      out.append(s.substring(copied, escape));
      if (escape > 0 && s.charAt(escape - 1) == '\\') {
        // Escaped literal: the preceding backslash was just copied above, so
        // drop this backslash and emit only the 'u'.
        out.append('u');
        copied = escape + 2;
      }
      else if (escape + 6 > s.length()) {
        // Too short to hold four hex digits: the tail is kept verbatim below.
        copied = escape;
        break;
      }
      else {
        int value = parseHex4(s, escape + 2);
        if (value < 0) {
          // Not four hex digits: keep the backslash-u and rescan after it.
          out.append("\\u");
          copied = escape + 2;
        }
        else {
          out.append((char) value);
          copied = escape + 6;
        }
      }
      escape = s.indexOf("\\u", copied);
    }
    out.append(s.substring(copied));
    return out.toString();
  }

  /**
   * Parses the four characters starting at start as an unsigned hex number.
   * The caller guarantees that start + 4 does not exceed the string length.
   * Returns -1 unless all four are ASCII hex digits (signs are rejected).
   */
  private static int parseHex4(String s, int start) {
    int value = 0;
    for (int i = start; i < start + 4; i++) {
      int digit = hexValue(s.charAt(i));
      if (digit < 0) {
        return -1;
      }
      value = (value << 4) | digit;
    }
    return value;
  }

  /** Returns the value of an ASCII hex digit, or -1 if c is not one. */
  private static int hexValue(char c) {
    if (c >= '0' && c <= '9') {
      return c - '0';
    }
    if (c >= 'a' && c <= 'f') {
      return c - 'a' + 10;
    }
    if (c >= 'A' && c <= 'F') {
      return c - 'A' + 10;
    }
    return -1;
  }

  /**
   * Replaces every character above U+00FF by a <code>&#92;uXXXX</code> escape
   * with four lowercase hex digits (zero-padded, so the result can be read
   * back by {@link #unescapeUnicode(String, String)}). Characters up to U+00FF
   * are copied unchanged, including any backslash + 'u' text already present
   * in the input.
   *
   * @param s the string to escape; may be null
   * @return the escaped string, or null if <code>s</code> is null
   */
  public static String escapeUnicode(String s) {
    if (s == null) {
      return s;
    }
    StringBuffer sb = new StringBuffer(s.length());
    for (int i = 0; i < s.length(); i++) {
      char c = s.charAt(i);
      if (c > 0xff) {
        String hex = Integer.toHexString(c);
        sb.append("\\u");
        // toHexString drops leading zeros; an escape needs exactly 4 digits
        for (int pad = hex.length(); pad < 4; pad++) {
          sb.append('0');
        }
        sb.append(hex);
      }
      else {
        // Latin-1 range (not only ASCII) is passed through as-is
        sb.append(c);
      }
    }
    return sb.toString();
  }
}
<p class="indent">
|