benchmarks/regression/CharsetBenchmark.java - platform/libcore - Git at Google

 /*
  * Copyright (C) 2010 Google Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package benchmarks.regression;

 import java.nio.charset.Charset;
 import com.google.caliper.Param;
 import com.google.caliper.SimpleBenchmark;

 /**
  * Caliper benchmark for converting between {@code byte[]} and {@code String}
  * through the charset-taking {@code String} constructors and
  * {@code String.getBytes(String)}.
  *
  * <p>Each timed method builds its input once, outside the measured loop, from
  * the {@code length} parameter and then repeats the conversion {@code reps} times.
  */
 public class CharsetBenchmark extends SimpleBenchmark {
     /** Number of characters (and therefore US-ASCII bytes) in the benchmarked data. */
     @Param({ "1", "10", "100", "1000", "10000" })
     private int length;

     // canonical    => canonical charset name
     // built-in     => guaranteed-present charset
     // special-case => libcore treats this charset specially for performance
     @Param({
         "UTF-16",     //     canonical,     built-in, non-special-case
         "UTF-8",      //     canonical,     built-in,     special-case
         "UTF8",       // non-canonical,     built-in,     special-case
         "ISO-8859-1", //     canonical,     built-in,     special-case
         "8859_1",     // non-canonical,     built-in,     special-case
         "ISO-8859-2", //     canonical, non-built-in, non-special-case
         "8859_2",     // non-canonical, non-built-in, non-special-case
         "US-ASCII",   //     canonical,     built-in,     special-case
         "ASCII"       // non-canonical,     built-in,     special-case
     })
     private String name;

     /**
      * Times {@code new String(byte[], String)} using the charset called {@code name}.
      *
      * @param reps number of conversions to perform
      */
     public void time_new_String_BString(int reps) throws Exception {
         byte[] bytes = makeBytes(makeString(length));
         for (int i = 0; i < reps; ++i) {
             new String(bytes, name);
         }
     }

     /**
      * Times {@code new String(byte[], int, int)}. This constructor takes no charset
      * name, so the {@code name} parameter is not used by this method.
      *
      * @param reps number of conversions to perform
      */
     public void time_new_String_BII(int reps) throws Exception {
         byte[] bytes = makeBytes(makeString(length));
         for (int i = 0; i < reps; ++i) {
             new String(bytes, 0, bytes.length);
         }
     }

     /**
      * Times {@code new String(byte[], int, int, String)} over the whole array,
      * using the charset called {@code name}.
      *
      * @param reps number of conversions to perform
      */
     public void time_new_String_BIIString(int reps) throws Exception {
         byte[] bytes = makeBytes(makeString(length));
         for (int i = 0; i < reps; ++i) {
             new String(bytes, 0, bytes.length, name);
         }
     }

     /**
      * Times {@code String.getBytes(String)} using the charset called {@code name}.
      *
      * @param reps number of conversions to perform
      */
     public void time_String_getBytes(int reps) throws Exception {
         String string = makeString(length);
         for (int i = 0; i < reps; ++i) {
             string.getBytes(name);
         }
     }

     // FIXME: benchmark this pure-java implementation for US-ASCII and ISO-8859-1 too!

     /**
      * Translates the given characters to US-ASCII or ISO-8859-1 bytes, using the fact that
      * Unicode code points between U+0000 and U+007f inclusive are identical to US-ASCII, while
      * U+0000 to U+00ff inclusive are identical to ISO-8859-1.
      *
      * @param chars source characters
      * @param offset index of the first character to translate
      * @param length number of characters to translate
      * @param maxValidChar largest char value copied through unchanged; any larger
      *         value is replaced by {@code '?'}
      * @return a new array of exactly {@code length} bytes
      */
     private static byte[] toDirectMappedBytes(char[] chars, int offset, int length, int maxValidChar) {
         byte[] result = new byte[length];
         int o = offset;
         for (int i = 0; i < length; ++i) {
             int ch = chars[o++];
             result[i] = (byte) ((ch <= maxValidChar) ? ch : '?');
         }
         return result;
     }

     /**
      * Encodes {@code length} chars starting at {@code offset} as UTF-8 and returns
      * the bytes collected by a fresh {@code UnsafeByteSequence}.
      */
     private static byte[] toUtf8Bytes(char[] chars, int offset, int length) {
         // NOTE(review): 'length' is presumably an initial capacity hint; the output may
         // need more than one byte per char - confirm against UnsafeByteSequence.
         UnsafeByteSequence result = new UnsafeByteSequence(length);
         toUtf8Bytes(chars, offset, length, result);
         return result.toByteArray();
     }

     /**
      * Encodes {@code length} chars starting at {@code offset} as UTF-8, writing each
      * output byte to {@code out}. A surrogate that is not the high half of a valid
      * surrogate pair is written as a single {@code '?'}.
      */
     private static void toUtf8Bytes(char[] chars, int offset, int length, UnsafeByteSequence out) {
         final int end = offset + length;
         for (int i = offset; i < end; ++i) {
             int ch = chars[i];
             if (ch < 0x80) {
                 // One byte.
                 out.write(ch);
             } else if (ch < 0x800) {
                 // Two bytes.
                 out.write((ch >> 6) | 0xc0);
                 out.write((ch & 0x3f) | 0x80);
             } else if (ch >= Character.MIN_SURROGATE && ch <= Character.MAX_SURROGATE) {
                 // A supplementary character.
                 char high = (char) ch;
                 // At the end of the range there is no partner; U+0000 makes the pair check fail.
                 char low = (i + 1 != end) ? chars[i + 1] : '\u0000';
                 if (!Character.isSurrogatePair(high, low)) {
                     out.write('?');
                     continue;
                 }
                 // Now we know we have a *valid* surrogate pair, we can consume the low surrogate.
                 ++i;
                 ch = Character.toCodePoint(high, low);
                 // Four bytes.
                 out.write((ch >> 18) | 0xf0);
                 out.write(((ch >> 12) & 0x3f) | 0x80);
                 out.write(((ch >> 6) & 0x3f) | 0x80);
                 out.write((ch & 0x3f) | 0x80);
             } else {
                 // Three bytes.
                 out.write((ch >> 12) | 0xe0);
                 out.write(((ch >> 6) & 0x3f) | 0x80);
                 out.write((ch & 0x3f) | 0x80);
             }
         }
     }

     /**
      * Returns a string of exactly {@code length} characters that cycles through
      * the letters {@code 'A'} to {@code 'Z'}.
      */
     private static String makeString(int length) {
         StringBuilder result = new StringBuilder(length);
         for (int i = 0; i < length; ++i) {
             // The cast matters: 'A' + int is an int, and append(int) would add the
             // decimal digits "65", "66", ... instead of one letter per iteration.
             result.append((char) ('A' + (i % 26)));
         }
         return result.toString();
     }

     /**
      * Encodes {@code s} as US-ASCII bytes, rethrowing any failure as a
      * {@code RuntimeException} with the original exception as its cause.
      */
     private static byte[] makeBytes(String s) {
         try {
             return s.getBytes("US-ASCII");
         } catch (Exception ex) {
             throw new RuntimeException(ex);
         }
     }
 }
	/*
	* Copyright (C) 2010 Google Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package benchmarks.regression;

	import java.nio.charset.Charset;
	import com.google.caliper.Param;
	import com.google.caliper.SimpleBenchmark;

	public class CharsetBenchmark extends SimpleBenchmark {
	@Param({ "1", "10", "100", "1000", "10000" })
	private int length;

	// canonical => canonical charset name
	// built-in => guaranteed-present charset
	// special-case => libcore treats this charset specially for performance
	@Param({
	"UTF-16", // canonical, built-in, non-special-case
	"UTF-8", // canonical, built-in, special-case
	"UTF8", // non-canonical, built-in, special-case
	"ISO-8859-1", // canonical, built-in, special-case
	"8859_1", // non-canonical, built-in, special-case
	"ISO-8859-2", // canonical, non-built-in, non-special-case
	"8859_2", // non-canonical, non-built-in, non-special-case
	"US-ASCII", // canonical, built-in, special-case
	"ASCII" // non-canonical, built-in, special-case
	})
	private String name;

	public void time_new_String_BString(int reps) throws Exception {
	byte[] bytes = makeBytes(makeString(length));
	for (int i = 0; i < reps; ++i) {
	new String(bytes, name);
	}
	}

	public void time_new_String_BII(int reps) throws Exception {
	byte[] bytes = makeBytes(makeString(length));
	for (int i = 0; i < reps; ++i) {
	new String(bytes, 0, bytes.length);
	}
	}

	public void time_new_String_BIIString(int reps) throws Exception {
	byte[] bytes = makeBytes(makeString(length));
	for (int i = 0; i < reps; ++i) {
	new String(bytes, 0, bytes.length, name);
	}
	}

	public void time_String_getBytes(int reps) throws Exception {
	String string = makeString(length);
	for (int i = 0; i < reps; ++i) {
	string.getBytes(name);
	}
	}

	// FIXME: benchmark this pure-java implementation for US-ASCII and ISO-8859-1 too!

	/**
	* Translates the given characters to US-ASCII or ISO-8859-1 bytes, using the fact that
	* Unicode code points between U+0000 and U+007f inclusive are identical to US-ASCII, while
	* U+0000 to U+00ff inclusive are identical to ISO-8859-1.
	*/
	private static byte[] toDirectMappedBytes(char[] chars, int offset, int length, int maxValidChar) {
	byte[] result = new byte[length];
	int o = offset;
	for (int i = 0; i < length; ++i) {
	int ch = chars[o++];
	result[i] = (byte) ((ch <= maxValidChar) ? ch : '?');
	}
	return result;
	}

	private static byte[] toUtf8Bytes(char[] chars, int offset, int length) {
	UnsafeByteSequence result = new UnsafeByteSequence(length);
	toUtf8Bytes(chars, offset, length, result);
	return result.toByteArray();
	}

	private static void toUtf8Bytes(char[] chars, int offset, int length, UnsafeByteSequence out) {
	final int end = offset + length;
	for (int i = offset; i < end; ++i) {
	int ch = chars[i];
	if (ch < 0x80) {
	// One byte.
	out.write(ch);
	} else if (ch < 0x800) {
	// Two bytes.
	out.write((ch >> 6) \| 0xc0);
	out.write((ch & 0x3f) \| 0x80);
	} else if (ch >= Character.MIN_SURROGATE && ch <= Character.MAX_SURROGATE) {
	// A supplementary character.
	char high = (char) ch;
	char low = (i + 1 != end) ? chars[i + 1] : '\u0000';
	if (!Character.isSurrogatePair(high, low)) {
	out.write('?');
	continue;
	}
	// Now we know we have a valid surrogate pair, we can consume the low surrogate.
	++i;
	ch = Character.toCodePoint(high, low);
	// Four bytes.
	out.write((ch >> 18) \| 0xf0);
	out.write(((ch >> 12) & 0x3f) \| 0x80);
	out.write(((ch >> 6) & 0x3f) \| 0x80);
	out.write((ch & 0x3f) \| 0x80);
	} else {
	// Three bytes.
	out.write((ch >> 12) \| 0xe0);
	out.write(((ch >> 6) & 0x3f) \| 0x80);
	out.write((ch & 0x3f) \| 0x80);
	}
	}
	}

	private static String makeString(int length) {
	StringBuilder result = new StringBuilder(length);
	for (int i = 0; i < length; ++i) {
	result.append('A' + (i % 26));
	}
	return result.toString();
	}

	private static byte[] makeBytes(String s) {
	try {
	return s.getBytes("US-ASCII");
	} catch (Exception ex) {
	throw new RuntimeException(ex);
	}
	}
	}