请教3种分割字符串方法的效率问题

云想慕尘 2015-06-13 03:21:24

3种常用的分割方法，按道理讲应该是最下面一种效率最高，但实际测试发现第3种竟然最低，求原因

String str = null;

	/**

	 * 生成一个字符串

	 */

	public MySplit() {

		StringBuilder sb = new StringBuilder();

		for (int i = 0; i < 1000; i++) {

			sb.append(i);

			sb.append(";");

		}

		str = sb.toString();

	}



	/**

	 * 使用split分割

	 */

	public void strSplit() {

		for (int i = 0; i < 10; i++) {

			str.split(";");

		}

	}



	/**

	 * 使用StringTokenizer类分割

	 */

	public void strTokenizer() {

		StringTokenizer st = new StringTokenizer(str, ";");

		for (int i = 0; i < 10; i++) {

			while (st.hasMoreTokens())

				st.nextToken();

			st = new StringTokenizer(str, ";");

		}

	}



	/**

	 * 使用indexOf和substring手工编码方式分割

	 */

	public void strIndexOf() {

		String tmp = str;

		for (int i = 0; i < 10; i++) {

			while (true) {

				int idx = tmp.indexOf(';');

				if (idx < 0)

					break;

				tmp = new String(tmp.substring(idx + 1));

			}

			tmp = str;

		}

	}

...全文

873 24 打赏收藏转发到动态举报

写回复

用AI写文章

24 条回复

切换为时间正序

请发表友善的回复…

发表回复

groovy2007 2015-06-18

打赏
举报

楼主是在jdk几上测试的？ substring的实现在jdk6和7之间发生了变化以前是引用原字符串，只记录起至位置后来改成了深度copy（避免内存泄露），这样的话效率会低很多 http://www.cnblogs.com/antineutrino/p/4213268.html

跳动de指尖 2015-06-17

打赏
举报

引用 22 楼 zys59 的回复:

[quote=引用 19 楼 pengqian098 的回复:]
String a = "|aa|bb";
a.split("|").length
//结果是多少？

三仙半 2015-06-17

打赏
举报

引用 19 楼 pengqian098 的回复:


String a = "|aa|bb";
a.split("|").length
//结果是多少？

三仙半 2015-06-17

打赏
举报

引用 19 楼 pengqian098 的回复:


String a = "|aa|bb";
a.split("|").length
//结果是多少？

考试题？

跳动de指尖 2015-06-16

打赏
举报


String a = "|aa|bb";
a.split("|").length
//结果是多少？

408985552 2015-06-16

打赏
举报

学习了,看了蛮有收获.

JPF1024 2015-06-15

打赏
举报

引用 16 楼 zys59 的回复:

下面是我测试时用的代码。mySplite()是按照我的那个思路写的分割函数，只是把变量e去掉了。反复运行了多次，每次时间都不一样，但是，规律没有变，我在注释里写了其中两次实验的结果。


package splite;

import java.util.ArrayList;
import java.util.Calendar;
import java.util.StringTokenizer;

/**
 *
 * @author zys59三仙半（QQ：597882752）<br>
 *         创建时间：2015年6月15日 上午8:28:29
 */
public class SplitTest {

	/**
	 * @param args
	 */
	public static void main(String[] args) {
		String str = null;
		StringBuilder sb = new StringBuilder();
		for (int i = 0; i < 1000; i++) {
			sb.append(i);
			sb.append(";");
		}
		str = sb.toString();

		SplitTest test = new SplitTest();

		// 使用split分割(61ms)80
		long s = Calendar.getInstance().getTimeInMillis();
		for (int i = 0; i < 1000; i++) {
			str.split(";");
		}
		long e = Calendar.getInstance().getTimeInMillis();
		System.out.println(e - s);

		// 使用StringTokenizer类分割(73ms)93
		s = Calendar.getInstance().getTimeInMillis();
		for (int i = 0; i < 1000; i++) {
			test.strTokenizer(str);
		}
		e = Calendar.getInstance().getTimeInMillis();
		System.out.println(e - s);

		// 使用indexOf和substring手工编码方式分割（13548ms）12540
		s = Calendar.getInstance().getTimeInMillis();
		for (int i = 0; i < 1000; i++) {
			test.strIndexOf(str);
		}
		e = Calendar.getInstance().getTimeInMillis();
		System.out.println(e - s);

		// 55ms//48
		s = Calendar.getInstance().getTimeInMillis();
		String[] r = null;
		for (int i = 0; i < 1000; i++) {
			r = test.mySplite(str);
		}
		e = Calendar.getInstance().getTimeInMillis();
		System.out.println(e - s);
		//这块是验证一下mySplite()方法的正确性
		System.out.println("=========\n分割后项数：" + r.length);
		for (int i = 0; i < r.length; i++) {
			System.out.println(r[i] + "\t");
		}

	}

	/**
	 * 分割字符串。
	 * 
	 * @param src
	 * @return
	 */
	public String[] mySplite(String src) {
		ArrayList<String> tmp = new ArrayList<String>();
		int s = 0;// ,e=0
		for (int i = 0; i < src.length(); i++) {
			if (src.charAt(i) == ';') {
				tmp.add(src.substring(s, i));
				s = i + 1;
			}
		}
		String[] result = new String[tmp.size()];
		return tmp.toArray(result);
	}

	/**
	 * 使用split分割
	 */
	public void strSplit(String str) {
		// for (int i = 0; i < 10; i++) {
		str.split(";");
		// }
	}

	/**
	 * 使用StringTokenizer类分割
	 */
	public void strTokenizer(String str) {
		StringTokenizer st = new StringTokenizer(str, ";");
		// for (int i = 0; i < 10; i++) {
		while (st.hasMoreTokens())
			st.nextToken();
		st = new StringTokenizer(str, ";");
		// }
	}

	/**
	 * 使用indexOf和substring手工编码方式分割
	 */
	public void strIndexOf(String str) {
		String tmp = str;
		for (int i = 0; i < 10; i++) {
			while (true) {
				int idx = tmp.indexOf(';');
				if (idx < 0)
					break;
				tmp = new String(tmp.substring(idx + 1));
			}
			tmp = str;
		}
	}
}

我没去看API的源代码，猜测一下，strTokenizer()和split()方法跟我的实现方式应该是一样的，只是它们需要考虑的因素要多一些，也就多一些判断，所以，比mySplite()稍微慢一点儿。而strIndexOf()慢，不是因为indexOf()，而是反复使用了new String()。代码测试是没有问题的，原因是纯猜测，欢迎大家批评，嘿嘿。

public String[] mySplite(String src) 是充分利用了优势啊

三仙半 2015-06-15

打赏
举报

不好意思，strIndexOf()多运行了9000次

结果应该是split()-->67; strTokenizer()-->84; strIndexOf()-->1269; mySplite()-->47

三仙半 2015-06-15

打赏
举报


package splite;

import java.util.ArrayList;
import java.util.Calendar;
import java.util.StringTokenizer;

/**
 *
 * @author zys59三仙半（QQ：597882752）<br>
 *         创建时间：2015年6月15日 上午8:28:29
 */
public class SplitTest {

	/**
	 * @param args
	 */
	public static void main(String[] args) {
		String str = null;
		StringBuilder sb = new StringBuilder();
		for (int i = 0; i < 1000; i++) {
			sb.append(i);
			sb.append(";");
		}
		str = sb.toString();

		SplitTest test = new SplitTest();

		// 使用split分割(61ms)80
		long s = Calendar.getInstance().getTimeInMillis();
		for (int i = 0; i < 1000; i++) {
			str.split(";");
		}
		long e = Calendar.getInstance().getTimeInMillis();
		System.out.println(e - s);

		// 使用StringTokenizer类分割(73ms)93
		s = Calendar.getInstance().getTimeInMillis();
		for (int i = 0; i < 1000; i++) {
			test.strTokenizer(str);
		}
		e = Calendar.getInstance().getTimeInMillis();
		System.out.println(e - s);

		// 使用indexOf和substring手工编码方式分割（13548ms）12540
		s = Calendar.getInstance().getTimeInMillis();
		for (int i = 0; i < 1000; i++) {
			test.strIndexOf(str);
		}
		e = Calendar.getInstance().getTimeInMillis();
		System.out.println(e - s);

		// 55ms//48
		s = Calendar.getInstance().getTimeInMillis();
		String[] r = null;
		for (int i = 0; i < 1000; i++) {
			r = test.mySplite(str);
		}
		e = Calendar.getInstance().getTimeInMillis();
		System.out.println(e - s);
		//这块是验证一下mySplite()方法的正确性
		System.out.println("=========\n分割后项数：" + r.length);
		for (int i = 0; i < r.length; i++) {
			System.out.println(r[i] + "\t");
		}

	}

	/**
	 * 分割字符串。
	 * 
	 * @param src
	 * @return
	 */
	public String[] mySplite(String src) {
		ArrayList<String> tmp = new ArrayList<String>();
		int s = 0;// ,e=0
		for (int i = 0; i < src.length(); i++) {
			if (src.charAt(i) == ';') {
				tmp.add(src.substring(s, i));
				s = i + 1;
			}
		}
		String[] result = new String[tmp.size()];
		return tmp.toArray(result);
	}

	/**
	 * 使用split分割
	 */
	public void strSplit(String str) {
		// for (int i = 0; i < 10; i++) {
		str.split(";");
		// }
	}

	/**
	 * 使用StringTokenizer类分割
	 */
	public void strTokenizer(String str) {
		StringTokenizer st = new StringTokenizer(str, ";");
		// for (int i = 0; i < 10; i++) {
		while (st.hasMoreTokens())
			st.nextToken();
		st = new StringTokenizer(str, ";");
		// }
	}

	/**
	 * 使用indexOf和substring手工编码方式分割
	 */
	public void strIndexOf(String str) {
		String tmp = str;
		for (int i = 0; i < 10; i++) {
			while (true) {
				int idx = tmp.indexOf(';');
				if (idx < 0)
					break;
				tmp = new String(tmp.substring(idx + 1));
			}
			tmp = str;
		}
	}
}

云想慕尘 2015-06-13

打赏
举报

引用 14 楼 zys59 的回复:

既然要自己分割，就别用indexOf()了，设置两个变量s和e，s初值为0，对字符串从头走到尾，找到一个;就设置e为那个位置减一，取子串，然后s=e+2，一趟循环解决问题。

不懂，愿闻其详，主要是我的目的不是为了分割字符串，只是对效率问题有疑问

三仙半 2015-06-13

打赏
举报

既然要自己分割，就别用indexOf()了，设置两个变量s和e，s初值为0，对字符串从头走到尾，找到一个;就设置e为那个位置减一，取子串，然后s=e+2，一趟循环解决问题。

云想慕尘 2015-06-13

打赏
举报

引用 7 楼 dcxy0 的回复:

[quote=引用 6 楼 cndotaci 的回复:] [quote=引用 2 楼 dcxy0 的回复:] 第三种方法会反复去定位位置，然后才截取。大概是这样。

3种方法的耗时如下： splitTest:17781997 tokenizerTest:21467545 indexOfTest:31145211 不晓得我第三种方法的写法中是不是由不合理的地方[/quote] 但是split是用的正则，按理说正则的查找速度会比indexof快一些而且split的最后返回也是通过正则的分割的、。如果你调试一下，你可以看下，split可能都没有经过判断，而是直接运行到最后一步了。；[/quote] 有判断的，而且里面的while循环也有运行，继续调试下去很复杂，不太看得懂

youzi05 2015-06-13

打赏
举报

首先

tmp = new String(tmp.substring(idx + 1));

得到的


2;3;4;5;
3;4;5;
4;5;
5;

而你想要的应该是

1;
2;
3;
4;
5;

也就是说你的结果根本不符合你的要求, 其次, 由于下面这行代码:

tmp = new String(tmp.substring(idx + 1));

产生的临时变量更费内存, 所以需要更多的时间去分配内存, 还有可能会产生更频繁的换页, 导致效率低下, 这个仅仅是有点可能吧, 因为这些产生的临时变量很快就会被丢弃, 所以可能在换页之前就被回收了, 也可能换页是把那些还没回收的变量换出去, 不见得是把那些有用的变量换出去,,,

云想慕尘 2015-06-13

打赏
举报

引用 9 楼 u011004037 的回复:

tmp = new String(tmp.substring(旧的idx, idx + 1));

1;2;3;4;5; while循环第一次循环时，把1分割掉，剩下2;3;4;5;，继续下一个循环第二次循环时，把2分割掉，剩下3;4;5;，继续下一个循环 …… 直到最后一个字符串中没有”;“

youzi05 2015-06-13

打赏
举报

引用 9 楼 u011004037 的回复:

tmp = new String(tmp.substring(旧的idx, idx + 1));

好吧, 可能又错了, 应该是:

tmp = new String(tmp.substring(旧的idx+1, idx));

没有测试, 应该是这样的

youzi05 2015-06-13

打赏
举报

tmp = new String(tmp.substring(旧的idx, idx + 1));

youzi05 2015-06-13

打赏
举报

tmp = new String(tmp.substring(idx + 1));

是干嘛呢?不是截取吗?那不该是

tmp = new String(tmp.substring(久的idx, idx + 1));

JPF1024 2015-06-13

打赏
举报

引用 6 楼 cndotaci 的回复:

[quote=引用 2 楼 dcxy0 的回复:] 第三种方法会反复去定位位置，然后才截取。大概是这样。

云想慕尘 2015-06-13

打赏
举报

引用 2 楼 dcxy0 的回复:

第三种方法会反复去定位位置，然后才截取。大概是这样。

3种方法的耗时如下： splitTest:17781997 tokenizerTest:21467545 indexOfTest:31145211 不晓得我第三种方法的写法中是不是由不合理的地方

云想慕尘 2015-06-13

打赏
举报

引用 2 楼 dcxy0 的回复:

第三种方法会反复去定位位置，然后才截取。大概是这样。

String的split()方法也是用了indexof()和substring()方法进行分割的，只不过还多了一些判断，还由返回值的部分，按理讲其实现过程要比上面写的第3个方法耗时更久才对，附split()方法源码

public String[] split(String regex, int limit) {
        /* fastpath if the regex is a
         (1)one-char String and this character is not one of the
            RegEx's meta characters ".$|()[{^?*+\\", or
         (2)two-char String and the first char is the backslash and
            the second is not the ascii digit or ascii letter.
         */
        char ch = 0;
        if (((regex.value.length == 1 &&
             ".$|()[{^?*+\\".indexOf(ch = regex.charAt(0)) == -1) ||
             (regex.length() == 2 &&
              regex.charAt(0) == '\\' &&
              (((ch = regex.charAt(1))-'0')|('9'-ch)) < 0 &&
              ((ch-'a')|('z'-ch)) < 0 &&
              ((ch-'A')|('Z'-ch)) < 0)) &&
            (ch < Character.MIN_HIGH_SURROGATE ||
             ch > Character.MAX_LOW_SURROGATE))
        {
            int off = 0;
            int next = 0;
            boolean limited = limit > 0;
            ArrayList<String> list = new ArrayList<>();
            while ((next = indexOf(ch, off)) != -1) {
                if (!limited || list.size() < limit - 1) {
                    list.add(substring(off, next));
                    off = next + 1;
                } else {    // last one
                    //assert (list.size() == limit - 1);
                    list.add(substring(off, value.length));
                    off = value.length;
                    break;
                }
            }
            // If no match was found, return this
            if (off == 0)
                return new String[]{this};

            // Add remaining segment
            if (!limited || list.size() < limit)
                list.add(substring(off, value.length));

            // Construct result
            int resultSize = list.size();
            if (limit == 0)
                while (resultSize > 0 && list.get(resultSize - 1).length() == 0)
                    resultSize--;
            String[] result = new String[resultSize];
            return list.subList(0, resultSize).toArray(result);
        }
        return Pattern.compile(regex).split(this, limit);
    }