package com.utstar.pizer.util.unicode;
import java.util.*; import java.io.*;
/**
 * Utility methods for converting between text and Java-style Unicode escapes
 * (a backslash, the letter {@code u} and exactly four hexadecimal digits).
 *
 * <p>Copyright: Copyright (c) 2003</p>
 *
 * @author not attributable
 * @version 1.0
 */
public class UnicodeUtil {

  /** The two-character marker that starts an escape: a backslash followed by the letter u. */
  private static final String ESCAPE_PREFIX = "\\u";

  /** Number of hexadecimal digits that follow the marker in a well-formed escape. */
  private static final int HEX_DIGITS = 4;

  /**
   * Small demonstration: escapes a sample string, writes it to {@code data.txt} in the
   * working directory as UTF-8, reads it back, unescapes it and prints the elapsed time.
   *
   * @param args ignored
   * @throws Exception if the file cannot be written or read
   */
  public static void main(String[] args) throws Exception {
    // Sample characters are written as escapes (U+4F60, U+6211, U+540C, U+65F6) so that
    // this file compiles the same way under any source encoding.
    System.out.println(Integer.toHexString((int) '\u4F60'));
    System.out.println(Integer.toHexString((int) '\u6211'));

    String tmp = "\\u" + "4F60" + "\\u" + "6211\u0000\u540c\\u\u65f6a\\bc\u5e87DEf_)*&^\\u^";
    System.out.println(tmp);
    System.out.println("[" + escapeUnicode(tmp) + "]");
    String tmp2 = escapeUnicode(tmp);

    OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream("data.txt"), "UTF-8");
    try {
      osw.write(tmp2);
      osw.flush();
    } finally {
      // Close even when the write fails so the file handle is not leaked.
      osw.close();
    }

    long start = System.currentTimeMillis();
    StringBuilder s = new StringBuilder();
    InputStreamReader isr = new InputStreamReader(new FileInputStream("data.txt"), "UTF-8");
    try {
      char[] inc = new char[1024 * 4];
      int p;
      while ((p = isr.read(inc, 0, inc.length)) != -1) {
        s.append(inc, 0, p);
      }
    } finally {
      isr.close();
    }
    System.out.println(unescapeUnicode(s.toString(), null));
    long end = System.currentTimeMillis();
    System.out.println("Time consumed:" + (end - start));
  }

  /**
   * Decodes 4-hex-digit Unicode escapes in a string, in the spirit of section 3.3
   * ("Unicode Escapes") of the Java Language Specification.
   *
   * <p>Example: the six characters backslash, u, 0, 0, B, F are replaced by the single
   * character U+00BF (inverted question mark). A marker that is immediately preceded by
   * another backslash is treated as escaped: the second backslash is dropped and the text
   * is kept literally. A marker that is not followed by exactly four ASCII hexadecimal
   * digits is copied through unchanged.
   *
   * <p>After decoding, the text is encoded to bytes and decoded again using the SAME
   * charset, so characters the charset cannot represent are replaced by that charset's
   * substitution (in practice usually {@code '?'}).
   *
   * @param s the string to decode; may be {@code null}
   * @param encoding name of the charset used for the final round trip; when {@code null}
   *     or empty, the {@code file.encoding} system property is used, falling back to
   *     {@code ISO-8859-1}
   * @return the decoded string, or {@code null} if {@code s} is {@code null}
   * @throws UnsupportedEncodingException if the charset name is not supported
   */
  public static String unescapeUnicode(String s, String encoding)
      throws UnsupportedEncodingException {
    if (s == null) {
      return s;
    }
    String charsetName = encoding;
    if (charsetName == null || "".equals(charsetName)) {
      charsetName = System.getProperty("file.encoding", "ISO-8859-1");
    }
    String decoded = decodeEscapes(s);
    // Encode and decode with the same charset; decoding with the platform default
    // instead would corrupt the text whenever the two charsets differ.
    return new String(decoded.getBytes(charsetName), charsetName);
  }

  /**
   * Replaces every well-formed escape in {@code s} with the character it denotes.
   *
   * <p>The backslash is not a general escape character here. The scan works as follows:
   * <ol>
   *   <li>find the next marker (backslash followed by u);</li>
   *   <li>if the character before it is a backslash, the marker is escaped: keep the
   *       earlier backslash, drop this one, keep the u, and resume scanning;</li>
   *   <li>if four ASCII hex digits follow, emit the corresponding character;</li>
   *   <li>otherwise copy the marker through and resume scanning after it.</li>
   * </ol>
   */
  private static String decodeEscapes(String s) {
    int escapeAt = s.indexOf(ESCAPE_PREFIX);
    if (escapeAt < 0) {
      return s;
    }
    StringBuilder out = new StringBuilder(s.length());
    int copiedTo = 0; // everything before this index has already been handled
    while (escapeAt >= 0) {
      out.append(s, copiedTo, escapeAt);
      int afterPrefix = escapeAt + ESCAPE_PREFIX.length();
      if (escapeAt > 0 && s.charAt(escapeAt - 1) == '\\') {
        // The preceding backslash was just copied as plain text; only the u remains.
        out.append('u');
        copiedTo = afterPrefix;
      } else if (hasFourHexDigits(s, afterPrefix)) {
        String hex = s.substring(afterPrefix, afterPrefix + HEX_DIGITS);
        out.append((char) Integer.parseInt(hex, 16));
        copiedTo = afterPrefix + HEX_DIGITS;
      } else {
        // Malformed or truncated escape: keep the marker and keep scanning.
        out.append(ESCAPE_PREFIX);
        copiedTo = afterPrefix;
      }
      escapeAt = s.indexOf(ESCAPE_PREFIX, copiedTo);
    }
    out.append(s, copiedTo, s.length());
    return out.toString();
  }

  /** Tells whether {@code s} holds four ASCII hexadecimal digits starting at {@code from}. */
  private static boolean hasFourHexDigits(String s, int from) {
    if (from + HEX_DIGITS > s.length()) {
      return false;
    }
    for (int i = from; i < from + HEX_DIGITS; i++) {
      if (!isAsciiHexDigit(s.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Tells whether {@code c} is one of 0-9, a-f or A-F. Signs and non-ASCII digits,
   * which {@code Integer.parseInt} would accept, are rejected.
   */
  private static boolean isAsciiHexDigit(char c) {
    return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
  }

  /**
   * Replaces every character above U+00FF with a 4-hex-digit escape (marker plus
   * lowercase hex, zero-padded to four digits). Characters up to and including U+00FF,
   * backslashes among them, are copied unchanged.
   *
   * @param s the string to escape; may be {@code null}
   * @return the escaped string, or {@code null} if {@code s} is {@code null}
   */
  public static String escapeUnicode(String s) {
    if (s == null) {
      return s;
    }
    StringBuilder sb = new StringBuilder(s.length());
    for (int i = 0; i < s.length(); i++) {
      char c = s.charAt(i);
      if (c > 0xff) {
        String hex = Integer.toHexString(c);
        sb.append(ESCAPE_PREFIX);
        // toHexString drops leading zeros; an escape must have exactly four digits.
        for (int pad = hex.length(); pad < HEX_DIGITS; pad++) {
          sb.append('0');
        }
        sb.append(hex);
      } else {
        sb.append(c);
      }
    }
    return sb.toString();
  }
}
|