# Python 快速统计数据的去重数和去重数据

wv,x1777856823 2021-04-14 03:03:51

# Use the items of `data` as dict keys; the resulting keys are the distinct values.
data_unique = {}.fromkeys(data).keys()
# Number of distinct values in `data`.
len(data_unique)

In [1]: import random

In [2]: data = [random.randint(0, 1000) for _ in xrange(1000000)]

In [3]: %timeit len(set(data))
10 loops, best of 3: 39.7 ms per loop

In [4]: %timeit len({}.fromkeys(data).keys())
10 loops, best of 3: 43.5 ms per loop

import collections
import random as py_random
import timeit

import numpy.random as np_random
import pandas as pd

# Number of random integers generated by each benchmark function below.
DATA_SIZE = 10000000

def py_cal_len():
    """Count distinct values in a random list using the built-in ``set``.

    Builds a list of ``DATA_SIZE`` integers drawn with
    ``py_random.randint(0, 1000)`` and returns the number of distinct values.
    """
    # xrange is the lazy Python 2 range; this script is written for Python 2.
    data = [py_random.randint(0, 1000) for _ in xrange(DATA_SIZE)]
    # Return the count instead of discarding it so callers can inspect it.
    return len(set(data))

def pd_cal_len():
    """Count distinct values in a random array using pandas ``value_counts``.

    Draws ``DATA_SIZE`` integers with ``np_random.randint(1000, ...)``, wraps
    them in a ``pd.Series`` and returns the size of its ``value_counts()``
    result, i.e. the number of distinct values.

    NOTE: ``numpy.random.randint(1000)`` excludes 1000, while the pure-Python
    benchmarks call ``random.randint(0, 1000)``, which includes it.
    """
    data = np_random.randint(1000, size=DATA_SIZE)
    data = pd.Series(data)
    data_unique = data.value_counts()
    # Return the count instead of discarding it so callers can inspect it.
    return data_unique.size

def py_count():
    """Tally occurrences of each value in a random list with ``Counter``.

    Builds a list of ``DATA_SIZE`` integers drawn with
    ``py_random.randint(0, 1000)`` and returns the ``collections.Counter``
    built from it.
    """
    # xrange is the lazy Python 2 range; this script is written for Python 2.
    data = [py_random.randint(0, 1000) for _ in xrange(DATA_SIZE)]
    # Return the tally instead of discarding it so callers can inspect it.
    return collections.Counter(data)

def pd_count():
    """Tally occurrences of each value in a random array with pandas.

    Draws ``DATA_SIZE`` integers with ``np_random.randint(1000, ...)``, wraps
    them in a ``pd.Series`` and returns the result of ``value_counts()``.
    """
    data = np_random.randint(1000, size=DATA_SIZE)
    data = pd.Series(data)
    # Return the tally instead of discarding it so callers can inspect it.
    return data.value_counts()

# Script starts from here

if __name__ == "__main__":
    # Time a single call of each benchmark function and print the elapsed
    # time reported by timeit, one line per function.
    for func_name in ("py_cal_len", "pd_cal_len", "py_count", "pd_count"):
        timer = timeit.Timer(
            "%s()" % func_name,
            setup="from __main__ import %s" % func_name
        )
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print(timer.timeit(number=1))

12.438587904
0.435907125473
14.6431810856
0.258564949036

...全文
36 回复 打赏 收藏 举报

238

2021-04-14 03:03