关于hash_map的时间消耗问题

志士惜日短，愁人知夜长️ 2014-04-11 04:47:00

写了个程序，有两个步骤
a、遍历某个文件夹中的所有文件，通过对这些文件进行md5取值，保存到输出文件中。
b、当这个文件夹中的某一个文件被修改了，那么输出出来这个文件的文件名、原来的md5值和新的md5值

看这个图：

这些图中进行的都是md5匹配操作，也就是上面描述的第二步
a、左上图使用的方式是：
首先得到文件中的md5值(上面描述的第一步)保存到一个hash_map中，每次在对文件夹中的一个文件进行求md5值的时候，遍历这个hash_map，先匹配文件路径(包含文件名)，再匹配md5的值，发现不一样，输出出来，发现一样的，break；

b、右上图使用的方式是：
首先得到文件中的md5值(上面描述的第一步)保存到一个hash_map中，再重新对现在的这个文件夹中的所有文件进行md5取值，输出到临时文件中，再从这个文件中读取数据，保存到另一个hash_map中，最后对两个hash_map中的数据进行匹配。

c、左下图
是b方式的空转

d、右下图
使用b方式时候总的消耗时间是148782ms，但是光是消耗在两个hash_map的时间就是144384ms了。
这是两个hash_map中的匹配操作：



void TraversalOperator_impl::Md5Match()

{

	// 先从md5_match_tmp_file.txt文件中获取到临时的md5值，存放到current_hash_info_中

	GetAllFileMd5ValueFromFile(current_hash_info_, MD5_MATCH_TMP_FILE);

	// 再去和file_hash_info_中的数据进行比较

	hash_map<string, string>::iterator it_file_hash_info_;

	hash_map<string, string>::iterator it_current_hash_info_;

	for (it_current_hash_info_ = current_hash_info_.begin(); it_current_hash_info_ != current_hash_info_.end(); it_current_hash_info_++)

	{

		for (it_file_hash_info_ = file_hash_info_.begin(); it_file_hash_info_ != file_hash_info_.end(); it_file_hash_info_++)

		{

			// 找到key

			if (!it_file_hash_info_->first.compare(it_current_hash_info_->first))

			{

				// 匹配value

				if (!it_file_hash_info_->second.compare(it_current_hash_info_->second))

				{

					break;

				}

				else

				{

					cout << it_current_hash_info_->first << " original md5 value is : "  << it_file_hash_info_->second;

					cout << "   ==============   new md5 value is : " << it_current_hash_info_->second << endl;

					cout << endl;

				}

				break;

			}

		}

	}

}

通过网上资料得知，消耗的时间主要是在begin()中
讨论一下，这要怎么改，速度才能更快？换成方式a？或者有更好的办法？

弱弱的感觉，瓶颈就出在这个匹配上面，无论是方式a还是方式b，这个匹配都是耗时的罪魁祸首。

...全文

175 4 打赏收藏转发到动态举报

写回复

用AI写文章

4 条回复

切换为时间正序

请发表友善的回复…

发表回复

志士惜日短，愁人知夜长️ 2014-04-15

打赏
举报

一个测试程序：



#include "stdafx.h"

#include "iostream"

#include "fstream"

#include "string"

#include "sstream"

using namespace std;

#include "unordered_map"

#include "time.h"





// Timing helpers built on clock().
// BEGINE_GET_TIME declares a local `start_time` and ENG_GET_TIME declares a
// local `end_time`; because each expands to a declaration, each can appear at
// most once in a given scope. CONSOLE_TIME reads both variables, so it must
// come after the other two in the same scope.
#define  BEGINE_GET_TIME clock_t start_time = clock();

#define  ENG_GET_TIME    clock_t end_time = clock();



#define  CONSOLE_TIME    cout << "Running time is: " << static_cast<double>(end_time - start_time) / CLOCKS_PER_SEC * 1000 << "ms" << endl; // print the elapsed time, converted from clock ticks to milliseconds





// Micro-benchmark: fills an unordered_map with 10001 string pairs
// ("zeng<i>" -> "zengraoli<i>") and times 10000 keyed lookups of the last
// key with find(). The elapsed time is printed by CONSOLE_TIME.
int _tmain(int argc, _TCHAR* argv[])
{
	unordered_map<string, string> table;

	// Populate the table; a fresh stream per string avoids any state reset.
	for (int i = 0; i < 10001; ++i)
	{
		stringstream key_stream;
		key_stream << "zeng" << i;

		stringstream value_stream;
		value_stream << "zengraoli" << i;

		table.insert(pair<string, string>(key_stream.str(), value_stream.str()));
	}

	// Alternative benchmark (disabled): locate the same entry by iterating
	// over the whole table instead of calling find().
	//
	// 	BEGINE_GET_TIME;
	// 	for (int round = 0; round < 10000; ++round)
	// 	{
	// 		unordered_map<string, string>::iterator pos;
	// 		for (pos = table.begin(); pos != table.end(); ++pos)
	// 		{
	// 			if (pos->first == "zeng10000")
	// 			{
	// 				if (pos->second == "zengraoli10000")
	// 				{
	// 					// cout << "find" << endl;
	// 				}
	// 			}
	// 		}
	// 	}
	// 	ENG_GET_TIME;
	// 	CONSOLE_TIME;

	BEGINE_GET_TIME;

	// Measure the speed of direct keyed lookup.
	for (int round = 0; round < 10000; ++round)
	{
		unordered_map<string, string>::iterator hit = table.find("zeng10000");

		if (hit == table.end())
		{
			cout << "not find" << endl;
			continue;
		}

		if (hit->second == "zengraoli10000")
		{
			// cout << "find" << endl;
		}
	}

	ENG_GET_TIME;

	CONSOLE_TIME;

	return 0;
}