千万级数据库(MSSQL)删除重复记录

luckycynthia 2010-06-21 11:46:31

在抓取数据后对数据进行操作的过程中，有时会碰到重复数据。重复数据有时会导致数据库的部分设置无法正确完成，所以需要进行筛选。

首先，有两种意义上的重复记录：一是完全重复的记录，即所有字段均重复的记录；二是部分关键字段重复的记录，比如Name字段重复，而其他字段不一定重复，或者其他字段是否重复可以忽略。

对于第一种重复，比较容易解决，使用：
　　select distinct * from tableName
就可以得到无重复记录的结果集。
如果该表需要删除重复的记录（重复记录保留1条），可以按以下方法删除：
-- Rebuild tableName so that only one copy of each fully identical row remains.
-- Step 1: stage the distinct rows in a session-local temp table.
select distinct * into #Tmp from tableName;

-- Step 2: drop and re-create tableName inside one transaction, so a failed reload
-- rolls the drop back instead of leaving the database without tableName.
-- NOTE(review): SELECT ... INTO creates a bare table; indexes, constraints, triggers and
-- permissions of the dropped table are not carried over and must be re-created afterwards.
begin try
    begin transaction;
    drop table tableName;
    select * into tableName from #Tmp;
    commit transaction;
end try
begin catch
    if @@trancount > 0
        rollback transaction;
    declare @err nvarchar(2048);
    set @err = error_message();
    raiserror('%s', 16, 1, @err);
    return;  -- leave #Tmp in place so the staged rows are not lost
end catch

-- Step 3: the staging copy is no longer needed.
drop table #Tmp;

对于第二种重复，我们需要使用以下这种方法，假设重复字段为Mobile和UnitName：
-- Rows whose (Mobile, UnitName) pair already appears on a row with a smaller id.
-- The smallest-id row of each pair is the one kept, so it is not returned.
-- Both columns are compared on the same row pair and every reference is to tableName.
select * from tableName
where exists (
    select 1
    from tableName as dup
    where dup.[Mobile] = tableName.[Mobile]
      and dup.[UnitName] = tableName.[UnitName]
      and dup.id < tableName.id
)
这样是得到了重复的数据，如果想要删除，只需把开头的select *改成delete即可。

接下来是重点，如果数据库真的有一千万，或者上千万，千万别直接对整个库进行操作，好点的办法就是把一个库分成多个，比如原先有1000W，就分成10个100W，然后分别对这10个100W的库操作，不过如果真的有上千万的数据，建议还是改用oracle数据库比较好，我用sql server 2008感觉上都有点吃不消，而且还是双核酷睿CPU，可能是内存有点小，只有2G。

文章转载于商企通官方网站：www.caiyufu.com

...全文

463 17 打赏收藏转发到动态举报

写回复

用AI写文章

17 条回复

切换为时间正序

请发表友善的回复…

发表回复

橡胶轮胎行业数字化高总 2010-07-06

打赏
举报

--处理表重复记录(查询和删除)

/******************************************************************************************************************************************************

1、Num、Name相同的重复值记录,没有大小关系只保留一条

2、Name相同,ID有大小关系时,保留大或小其中一个记录

整理人：中国风(Roy)



日期:2008.06.06

******************************************************************************************************************************************************/



--1、用于查询重复处理记录(如果列没有大小关系时2000用生成自增列和临时表处理,SQL2005用row_number函数处理)



--> --> (Roy)生成測試數據

 

-- Fixture: (re)create temp table #T so the examples can be re-run in the same session.
-- The database name is written in lower case ('tempdb') so the lookup also resolves on
-- servers with a case-sensitive collation.
if object_id('tempdb..#T') is not null
    drop table #T;
go

create table #T
(
    [ID]   int,
    [Name] nvarchar(1),
    [Memo] nvarchar(2)
);

-- Three rows share Name 'A' and two share Name 'B'; every ID is distinct.
insert into #T ([ID], [Name], [Memo])
          select 1, N'A', N'A1'
union all select 2, N'A', N'A2'
union all select 3, N'A', N'A3'
union all select 4, N'B', N'B1'
union all select 5, N'B', N'B2';
go





-- I. For each Name, return the row with the smallest ID.
--    The original author recommends methods 1-3 and reports method 3 as faster than
--    methods 1 and 2 on SQL Server 2005.
--    Every method below is an independent alternative producing the same rows.

-- Method 1: NOT EXISTS - no row with the same Name has a smaller ID.
select *
from #T as a
where not exists (select 1 from #T as b where b.Name = a.Name and b.ID < a.ID);

-- Method 2: join back to the per-Name minimum.
select a.*
from #T as a
join (select min(ID) as ID, Name from #T group by Name) as b
    on a.Name = b.Name and a.ID = b.ID;

-- Method 3: correlated scalar subquery returning the minimum ID for the row's Name.
select *
from #T as a
where ID = (select min(ID) from #T as b where b.Name = a.Name);

-- Method 4: self-join counting rows with ID <= this row's ID; the minimum row matches only itself.
select a.*
from #T as a
join #T as b
    on a.Name = b.Name and a.ID >= b.ID
group by a.ID, a.Name, a.Memo
having count(1) = 1;

-- Method 5: same test as method 3, expressed in HAVING over a group per row.
select *
from #T as a
group by ID, Name, Memo
having ID = (select min(ID) from #T as b where b.Name = a.Name);

-- Method 6: count of same-Name rows with a smaller ID is zero.
select *
from #T as a
where (select count(1) from #T as b where b.Name = a.Name and b.ID < a.ID) = 0;

-- Method 7: TOP 1 ordered ascending picks the minimum ID.
select *
from #T as a
where ID = (select top 1 b.ID from #T as b where b.Name = a.Name order by b.ID);

-- Method 8: ID is not greater than any ID of the same Name
-- (written with the ISO operator <= instead of the non-ISO !>).
select *
from #T as a
where ID <= all (select b.ID from #T as b where b.Name = a.Name);

-- Method 9 (only valid when ID is unique across the whole table).
select *
from #T as a
where ID in (select min(ID) from #T group by Name);

-- SQL Server 2005 and later (window functions):

-- Method 10: windowed MIN per Name.
select ID, Name, Memo
from (select *, min(ID) over (partition by Name) as MinID from #T as a) as T
where ID = MinID;

-- Method 11: number the rows of each Name by ascending ID and keep the first.
select ID, Name, Memo
from (select *, row_number() over (partition by Name order by ID) as rn from #T as a) as T
where rn = 1;

/*
Expected result:

ID          Name Memo
----------- ---- ----
1           A    A1
4           B    B1

(2 rows affected)
*/





-- II. For each Name, return the row with the largest ID (the mirror image of the MIN variants).
--     Every method below is an independent alternative producing the same rows.

-- Method 1: NOT EXISTS - no row with the same Name has a larger ID.
select *
from #T as a
where not exists (select 1 from #T as b where b.Name = a.Name and b.ID > a.ID);

-- Method 2: join back to the per-Name maximum.
select a.*
from #T as a
join (select max(ID) as ID, Name from #T group by Name) as b
    on a.Name = b.Name and a.ID = b.ID
order by ID;

-- Method 3: correlated scalar subquery returning the maximum ID for the row's Name.
select *
from #T as a
where ID = (select max(ID) from #T as b where b.Name = a.Name)
order by ID;

-- Method 4: self-join counting rows with ID >= this row's ID; the maximum row matches only itself.
select a.*
from #T as a
join #T as b
    on a.Name = b.Name and a.ID <= b.ID
group by a.ID, a.Name, a.Memo
having count(1) = 1;

-- Method 5: same test as method 3, expressed in HAVING over a group per row.
select *
from #T as a
group by ID, Name, Memo
having ID = (select max(ID) from #T as b where b.Name = a.Name);

-- Method 6: count of same-Name rows with a larger ID is zero.
select *
from #T as a
where (select count(1) from #T as b where b.Name = a.Name and b.ID > a.ID) = 0;

-- Method 7: TOP 1 ordered descending picks the maximum ID.
select *
from #T as a
where ID = (select top 1 b.ID from #T as b where b.Name = a.Name order by b.ID desc);

-- Method 8: ID is not less than any ID of the same Name
-- (written with the ISO operator >= instead of the non-ISO !<).
select *
from #T as a
where ID >= all (select b.ID from #T as b where b.Name = a.Name);

-- Method 9 (only valid when ID is unique across the whole table).
select *
from #T as a
where ID in (select max(ID) from #T group by Name);

-- SQL Server 2005 and later (window functions):

-- Method 10: windowed MAX per Name.
select ID, Name, Memo
from (select *, max(ID) over (partition by Name) as MaxID from #T as a) as T
where ID = MaxID;

-- Method 11: number the rows of each Name by descending ID and keep the first.
select ID, Name, Memo
from (select *, row_number() over (partition by Name order by ID desc) as rn from #T as a) as T
where rn = 1;

/*
Expected result 2:

ID          Name Memo
----------- ---- ----
3           A    A3
5           B    B2

(2 rows affected)
*/







--2、删除重复记录有大小关系时,保留大或小其中一个记录





--> --> (Roy)生成測試數據



-- Fixture for the delete examples: (re)create temp table #T.
-- The database name is written in lower case ('tempdb') so the lookup also resolves on
-- servers with a case-sensitive collation.
if object_id('tempdb..#T') is not null
    drop table #T;
go

create table #T
(
    [ID]   int,
    [Name] nvarchar(1),
    [Memo] nvarchar(2)
);

-- Three rows share Name 'A' and two share Name 'B'; every ID is distinct.
insert into #T ([ID], [Name], [Memo])
          select 1, N'A', N'A1'
union all select 2, N'A', N'A2'
union all select 3, N'A', N'A3'
union all select 4, N'B', N'B1'
union all select 5, N'B', N'B2';
go



-- I. Delete duplicates by Name, keeping the row with the smallest ID
--    (the original author recommends methods 1-3).
--    The methods are alternatives: once any one of them has run, the others find
--    nothing left to delete.

-- Method 1: delete a row when a same-Name row with a smaller ID exists.
delete a
from #T as a
where exists (select 1 from #T as b where b.Name = a.Name and b.ID < a.ID);

-- Method 2: left join to the per-Name minimum and delete the rows that did not match.
delete a
from #T as a
left join (select min(ID) as ID, Name from #T group by Name) as b
    on a.Name = b.Name and a.ID = b.ID
where b.ID is null;

-- Method 3: delete rows whose ID is not the minimum ID of their Name.
delete a
from #T as a
where ID not in (select min(ID) from #T as b where b.Name = a.Name);

-- Method 4 (only valid when ID is unique across the whole table).
delete a
from #T as a
where ID not in (select min(ID) from #T group by Name);

-- Method 5: count of same-Name rows with a smaller ID is greater than zero.
delete a
from #T as a
where (select count(1) from #T as b where b.Name = a.Name and b.ID < a.ID) > 0;

-- Method 6: ID differs from the TOP 1 (smallest) ID of the same Name.
delete a
from #T as a
where ID <> (select top 1 b.ID from #T as b where b.Name = a.Name order by b.ID);

-- Method 7: ID is greater than at least one ID of the same Name.
delete a
from #T as a
where ID > any (select b.ID from #T as b where b.Name = a.Name);

-- Check the result.
select * from #T;

/*
Expected result:

ID          Name Memo
----------- ---- ----
1           A    A1
4           B    B1

(2 rows affected)
*/





-- II. Delete duplicates by Name, keeping the row with the largest ID.
--     The methods are alternatives: once any one of them has run, the others find
--     nothing left to delete.

-- Method 1: delete a row when a same-Name row with a larger ID exists.
delete a
from #T as a
where exists (select 1 from #T as b where b.Name = a.Name and b.ID > a.ID);

-- Method 2: left join to the per-Name maximum and delete the rows that did not match.
delete a
from #T as a
left join (select max(ID) as ID, Name from #T group by Name) as b
    on a.Name = b.Name and a.ID = b.ID
where b.ID is null;

-- Method 3: delete rows whose ID is not the maximum ID of their Name.
delete a
from #T as a
where ID not in (select max(ID) from #T as b where b.Name = a.Name);

-- Method 4 (only valid when ID is unique across the whole table).
delete a
from #T as a
where ID not in (select max(ID) from #T group by Name);

-- Method 5: count of same-Name rows with a larger ID is greater than zero.
delete a
from #T as a
where (select count(1) from #T as b where b.Name = a.Name and b.ID > a.ID) > 0;

-- Method 6: ID differs from the TOP 1 (largest) ID of the same Name.
delete a
from #T as a
where ID <> (select top 1 b.ID from #T as b where b.Name = a.Name order by b.ID desc);

-- Method 7: ID is less than at least one ID of the same Name.
delete a
from #T as a
where ID < any (select b.ID from #T as b where b.Name = a.Name);

-- Check the result.
select * from #T;

/*
Expected result:

ID          Name Memo
----------- ---- ----
3           A    A3
5           B    B2

(2 rows affected)
*/











--3、删除重复记录没有大小关系时，处理重复值





--> --> (Roy)生成測試數據

 

-- Fixture for fully identical rows: (re)create temp table #T with no distinguishing column.
-- The database name is written in lower case ('tempdb') so the lookup also resolves on
-- servers with a case-sensitive collation.
if object_id('tempdb..#T') is not null
    drop table #T;
go

create table #T
(
    [Num]  int,
    [Name] nvarchar(1)
);

-- (1, 'A') appears three times and (2, 'B') twice.
insert into #T ([Num], [Name])
          select 1, N'A'
union all select 1, N'A'
union all select 1, N'A'
union all select 2, N'B'
union all select 2, N'B';
go



-- Method 1: stage the distinct rows, empty the table, then load the distinct rows back.

-- Drop a leftover staging table from an earlier run ('tempdb' in lower case so the
-- lookup also resolves on servers with a case-sensitive collation).
if object_id('tempdb..#') is not null
    drop table #;

-- Stage one copy of every distinct (Num, Name) row in temp table #.
select distinct Num, Name into # from #T;

-- Empty the original table.
truncate table #T;

-- Load the distinct rows from # back into #T.
insert into #T (Num, Name)
select Num, Name from #;

-- Check the result.
select * from #T;

/*
Expected result:

Num         Name
----------- ----
1           A
2           B

(2 rows affected)
*/



-- Re-run the fixture batch before trying method 2.
-- Method 2: add a temporary identity column so otherwise identical rows can be told apart.

alter table #T add ID int identity;  -- new identity column
go

-- Within each (Num, Name) group delete every row that has a sibling with a larger ID,
-- which leaves exactly one row per group.
delete a
from #T as a
where exists (
    select 1
    from #T as b
    where b.Num = a.Num and b.Name = a.Name and b.ID > a.ID
);
go

alter table #T drop column ID;  -- remove the helper identity column again

-- Check the result.
select * from #T;

/*
Expected result:

Num         Name
----------- ----
1           A
2           B

(2 rows affected)
*/



-- Re-run the fixture batch before trying method 3.
-- Method 3: walk the duplicated (Num, Name) groups with a cursor and delete all but one
-- row of each group.

-- One cursor row per duplicated group: the number of surplus copies (count - 1) and the key.
-- STATIC makes it explicit that the groups are captured before any row is deleted.
declare Roy_Cursor cursor local static for
    select count(1) - 1, Num, Name
    from #T
    group by Num, Name
    having count(1) > 1;

declare @con int, @Num int, @Name nvarchar(1);

open Roy_Cursor;

fetch next from Roy_Cursor into @con, @Num, @Name;

while @@fetch_status = 0
begin
    -- Remove exactly @con surplus copies of this group. DELETE TOP replaces the
    -- SET ROWCOUNT @con / SET ROWCOUNT 0 pair, which is deprecated for DELETE statements.
    delete top (@con) from #T where Num = @Num and Name = @Name;

    fetch next from Roy_Cursor into @con, @Num, @Name;
end

close Roy_Cursor;

deallocate Roy_Cursor;

-- Check the result.
select * from #T;

/*
Expected result:

Num         Name
----------- ----
1           A
2           B

(2 rows affected)
*/

pumaadamsjack 2010-07-05

打赏
举报

同意4楼的，这个数量级的删除对机器太残忍了。
建一个相同结构的表，将不重复的字段建唯一索引，然后设置遇到重复的处理办法（忽略）
这样比删除要快。而且新表是有索引的。

旅行者I号 2010-06-29

打赏
举报

[Quote=引用 9 楼 claro 的回复:]

1、非完全字段的重复数据del
可以使用exists更快捷且高效，举例：
假设重复字段为Mobile和UnitName：
select *
--delete
from tableName a
where exists (
select *
from tableName b
where a.Mobile = b.Mobile and a.Uni……
[/Quote]

（1）中会不会造成abca的集合结果只得到bc，就是有重复导致所有全删掉
（2）的思路记住了

yanjiexixu 2010-06-28

打赏
举报

学习。。。。。。。。

domydream 2010-06-27

打赏
举报

搞笑吧，sqlserver千万数据都支持不了?首先是你硬件的问题了，sqlserver别的我没玩过，至少10亿数据的一个表，没有任何问题

流氓兔 2010-06-27

打赏
举报

大哥PC和Server可是不同的啊，别看配置差不了多少！！！！！！！！！！！！！

宇峰科技 2010-06-22

打赏
举报

又是HL

chuifengde 2010-06-21

打赏
举报

建唯一索引不让插不就行了

永生天地 2010-06-21

打赏
举报

[Quote=引用楼主 luckycynthia 的回复:]
建议还是改用oracle数据库比较好，我用sql server 2008感觉上都有点吃不消，而且还是双核酷睿CPU，可能是内存有点小，只有2G。
[/Quote]
就这个机器，用oracle，估计也不行

albert_sky 2010-06-21

打赏
举报

MARK

Mr_Nice 2010-06-21

打赏
举报

学习...

claro 2010-06-21

打赏
举报

最后建议给大家介绍一些专业的网站，比如微软MSSQL的开发人员以及知名度较高的论坛。
如果有兴趣可以在me的微博上找到。

claro 2010-06-21

打赏
举报

有点晕。

纠正两个问题：
1、非完全字段的重复数据del
可以使用exists更快捷且高效，举例：
假设重复字段为Mobile和UnitName：
-- Lists every row for which an earlier row (smaller id) with the same Mobile and UnitName
-- exists, i.e. all duplicates except the first row of each group, so deleting them
-- still leaves one row per (Mobile, UnitName).
-- Assumes tableName has a unique id column, as in the query of the opening post - TODO confirm.
select a.*
--delete a   -- use this line instead of the select line above to delete the listed rows
from tableName as a
where exists (
    select 1
    from tableName as b
    where b.Mobile = a.Mobile
      and b.UnitName = a.UnitName
      and b.id < a.id
);

2、真正需要在一个生产数据库中del大量数据的可能性相对较小，如果真是这样，相信也没有多少时间让做间隔数据。
实际上仔细观察过MSSQL在处理千万级的table修改索引建的默认值或其他信息时，是这样处理：
首先生成当前表的#临时表，table名一模一样，用select * into #table的方式；
其次drop table 当前表，再新建符合要求的新当前表；
最后insert into 新当前表 select * from #table 的方式回导数据。
回到刚才的话题，如果真的是del大量数据，不妨借鉴筛选数据再回导的方式。