删除重复数据。

Generics 2007-09-28 01:57:03

我有一个表，数据可能比较多，是一个根据距离的多对多的影射，　其中包括有三项，　ID1　，　ID2　和　dist, 要求根据dist(距离），　删除ID1，　ID2两列里任何重复的ID，　最后变成一个一对一的影射（有些ID可能会没有匹配。）

譬如　
create table TA(ID1 numeric(18,0), ID2 numeric(18,0), dist float)

insert TA
select 10001, 10001, 0.0
UNION ALL select 10002, 10003, 1.0
UNION ALL select 10003, 10004, 3.0
UNION ALL select 10004, 10005, 10.0
UNION ALL select 10008, 10007, 0.5
UNION ALL select 10005, 10003, 0.8
UNION ALL select 10007, 10002, 0.2
UNION ALL select 10009, 10008, 7.0
UNION ALL select 10008, 10006, 0.8
UNION ALL select 10006, 10007, 0.51

最后结果应该是

10001, 10001, 0
10007, 10002, 0.2
10008, 10007, 0.5
10005, 10003, 0.8
10003, 10004, 3
10009, 10008, 7
10004, 10005, 10

...全文

4017 45 打赏收藏转发到动态举报

写回复

用AI写文章

45 条回复

切换为时间正序

请发表友善的回复…

发表回复

kelelipeng 2007-11-28

打赏
举报

---------------问题解决方法------------------
create table ##TA(id int identity(1,1), ID1 numeric(18,0), ID2 numeric(18,0), dist float)
insert ##TA(id1,id2,dist)
select 10001, 10001, 0.0
UNION ALL
select 10002, 10003, 1.0
UNION ALL
select 10003, 10004, 3.0
UNION ALL
select 10004, 10005, 10.0
UNION ALL
select 10008, 10007, 0.5
UNION ALL
select 10005, 10003, 0.8
UNION ALL
select 10007, 10002, 0.2
UNION ALL
select 10009, 10008, 7.0
UNION ALL
select 10008, 10006, 0.8
UNION ALL
select 10006, 10007, 0.51

delete from ##TA
where id in
(
select id from ##TA as a ,
(select max(id2) as id2 ,max(dist) as dist from ##TA group by id2 having count(id2)>1) as b
where a.id2=b.id2 and a.dist=b.dist
union
select id from ##TA as a ,
(select max(id1) as id1 ,max(dist) as dist from ##TA group by id1 having count(id1)>1) as b
where a.id1=b.id1 and a.dist=b.dist
)
------------------------------------------

javalzn 2007-11-18

打赏
举报

1、建立存储过程
drop procedure sp_ta
go

create procedure sp_ta
as
begin
create table #TA(ID1 numeric(18,0), ID2 numeric(18,0), dist float)

create table #TA_DEL(ID1 numeric(18,0), ID2 numeric(18,0), dist float)

insert into #TA
select * from TA A where not exists (select 1 from TA B where B.ID1=A.ID1 and A.dist> B.dist)
and not exists (select 1 from TA C where C.ID2=A.ID2 and A.dist> C.dist)

insert into #TA_DEL
select * from TA A where not exists (select 1 from #TA B where A.ID1 = B.ID1)
and not exists (select 1 from #TA C where A.ID2 = C.ID2)

insert into #TA
select * from #TA_DEL A where not exists (select 1 from #TA_DEL B where B.ID1=A.ID1 and A.dist> B.dist) free asp host
and not exists (select 1 from #TA_DEL C where C.ID2=A.ID2 and A.dist> C.dist)

insert into #TA
select * from #TA_DEL A where not exists (select 1 from #TA B where A.ID1 = B.ID1)
and not exists (select 1 from #TA C where A.ID2 = C.ID2)

select * from #TA order by dist

end
go

2、给表TA增加索引
create index idx_ta_id1 on TA(ID1)
go
create index idx_ta_id2 on TA(ID2)
go

3、运行存储过程返回结果
exec sp_ta
go

badboy_2007 2007-11-17

打赏
举报

看不太明白，但我知道数据库一个表达到1千万条记录的时候搜索一条需要耗时26秒。（奔四双核机器上）

marskisl 2007-11-09

打赏
举报

慢慢吸收!谢谢!

wzxhm 2007-10-29

打赏
举报

强烈要求楼主组织大数据量的结果测试(记录数达10万以上),并把测试结果告诉大家,看看那种办法正确,速度快.
数据量太小了,测试效果不明显.

cadenza7 2007-10-25

打赏
举报

早晨上班时看了下觉得挺有意思的,研究了一天,已经测试通过,不知道是不是你要的效果.

create table #TA(ID1 numeric(18,0), ID2 numeric(18,0), dist float)
insert #TA
select 10001, 11101, 0.01
union all select 10001, 11102, 0.21
--union all select 10001, 11103, 0.31
--union all select 10001, 11104, 0.41
union all select 10002, 11101, 0.12
union all select 10002, 11102, 0.32
union all select 10002, 11103, 0.52
union all select 10002, 11104, 0.72
union all select 10003, 11101, 0.23
union all select 10003, 11102, 0.43
union all select 10003, 11103, 0.63
--union all select 10003, 11104, 0.83
union all select 10004, 11101, 0.64
union all select 10004, 11102, 0.44
union all select 10004, 11103, 0.24
union all select 10004, 11104, 0.14

/*
display result
10001 11101 0.01
10004 11104 0.14
10002 11102 0.32
10003 11103 0.63
*/

--创建表并插入一条数据
select top 1 * into #Temp from #TA order by ID1,ID2

--也许你要插入的数据是以下这些情况
--select top 1 * into #Temp from #TA order by ID1 asc,ID2 asc
--select top 1 * into #Temp from #TA order by ID1 asc,ID2 desc
--select top 1 * into #Temp from #TA order by ID1 desc,ID2 asc
--select top 1 * into #Temp from #TA order by ID1 desc,ID2 desc
--select top 1 * into #Temp from #TA where ID1='xxx' and ID2='xxx'
--select top 1 * into #Temp from #TA where ID1='10001' and ID2='11102'

declare @ID1Count int
declare @ID2Count int
declare @i int
declare @Records int

declare @TopID1 int
declare @TopID2 int

set @TopID1=1
set @TopID2=1
set @i=0

set @ID1Count=(select count(distinct ID1) from #TA)
set @ID2Count=(select count(distinct ID2) from #TA)

set @Records=(select max(MaxNum) from(select @ID1Count MaxNum union all select @ID2Count) as asMaxNum)

while(@i<@Records)
begin

insert #Temp select top 1 * from #TA where ID1 not in(select top (select @TopID1) ID1 from #Temp order by ID1) and ID2 not in(select top (select @TopID2) ID2 from #Temp order by ID2)

set @i=@i+1
set @TopID1=@TopID1+1
set @TopID2=@TopID2+1

end

/*
方法一
*/
delete #TA
insert #TA select * from #Temp

/*
方法二
使用此方法时请将#TA,#Temp改为TA,Temp
*/
--drop Table TA
--select * into TA from Temp

select * from #TA order by dist

drop Table #TA
Drop Table #Temp

Generics 2007-10-13

打赏
举报

谢谢. 不过我已经give up了, 原来只是想看看有没有简单快速的query可以做这事, 不过现在看来, 这种问题看来还是直接写C#/VB程序好一点.

wzxhm 2007-10-13

打赏
举报

答案应该没有问题了，不知道效率怎样？
把测试结果告诉我好吗？或则发我几万条测试数据，我测测看．

wzxhm 2007-10-11

打赏
举报

检查了一下，确实有问题，事实上我在答复时就有担心，但我不想在存储过程中用循环，看来没有办法，必须得用，现在修改如下：

1、建立存储过程
drop procedure sp_ta
go

create procedure sp_ta
as
begin
create table #TA(ID1 numeric(18,0), ID2 numeric(18,0), dist float)
create index idx_temp_ta_id1 on #TA(ID1)
create index idx_temp_ta_id2 on #TA(ID2)

create table #TA_DEL(ID1 numeric(18,0), ID2 numeric(18,0), dist float)
create index idx_temp_ta_del_id1 on #TA_DEL(ID1)
create index idx_temp_ta_del_id2 on #TA_DEL(ID2)

insert into #TA
select * from TA A where not exists (select 1 from TA B where B.ID1=A.ID1 and A.dist >B.dist)
and not exists (select 1 from TA C where C.ID2=A.ID2 and A.dist >C.dist)

insert into #TA_DEL
select * from TA A where not exists (select 1 from #TA B where A.ID1 = B.ID1)
and not exists (select 1 from #TA C where A.ID2 = C.ID2)

While exists(select 1 from #TA_DEL)
Begin
insert into #TA
select * from #TA_DEL A where not exists (select 1 from #TA_DEL B where B.ID1=A.ID1 and A.dist >B.dist)
and not exists (select 1 from #TA_DEL C where C.ID2=A.ID2 and A.dist >C.dist)

delete from #TA_DEL

insert into #TA_DEL
select * from TA A where not exists (select 1 from #TA B where A.ID1 = B.ID1)
and not exists (select 1 from #TA C where A.ID2 = C.ID2)
End

select * from #TA order by dist
end
go

2、给表TA增加索引
create index idx_ta_id1 on TA(ID1)
go
create index idx_ta_id2 on TA(ID2)
go

3、运行存储过程返回结果
exec sp_ta
go

Generics 2007-10-10

打赏
举报

我发现你的方法根本就是不对的！你的方法专门就是针对我的那个特定例子而已，　数据一多，　你的方法就乱出结果了。你试试这个例子：

create   table   TA(ID1   numeric(18,0),   ID2   numeric(18,0),   dist   float) 



insert   TA 

select 10001, 11101, 0.01

union all select 10001, 11102, 0.21

union all select 10001, 11103, 0.31

union all select 10001, 11104, 0.41

union all select 10001, 11105, 0.36

union all select 10002, 11101, 0.12

union all select 10002, 11102, 0.32

union all select 10002, 11103, 0.52

union all select 10002, 11104, 0.72

union all select 10002, 11105, 0.68

union all select 10003, 11101, 0.23

union all select 10003, 11102, 0.43

union all select 10003, 11103, 0.63

union all select 10003, 11104, 0.83

union all select 10003, 10005, 0.77

union all select 10004, 11101, 0.64

union all select 10004, 11102, 0.44

union all select 10004, 11103, 0.24

union all select 10004, 11104, 0.14

union all select 10004, 11105, 0.22

union all select 10005, 11101, 0.18

union all select 10005, 11102, 0.33

union all select 10005, 11103, 0.37

union all select 10005, 11104, 0.48

union all select 10005, 11105, 0.88

wzxhm 2007-10-07

打赏
举报

谢谢Generics给我的100分。

wzxhm 2007-10-06

打赏
举报

用我的办法试试，测试数据已经通过。欢迎交流，email:xiahm@wztax.gov.cn。

1、建立存储过程
drop procedure sp_ta
go

create procedure sp_ta
as
begin
create table #TA(ID1 numeric(18,0), ID2 numeric(18,0), dist float)

create table #TA_DEL(ID1 numeric(18,0), ID2 numeric(18,0), dist float)

insert into #TA
select * from TA A where not exists (select 1 from TA B where B.ID1=A.ID1 and A.dist>B.dist)
and not exists (select 1 from TA C where C.ID2=A.ID2 and A.dist>C.dist)

insert into #TA_DEL
select * from TA A where not exists (select 1 from #TA B where A.ID1 = B.ID1)
and not exists (select 1 from #TA C where A.ID2 = C.ID2)

insert into #TA
select * from #TA_DEL A where not exists (select 1 from #TA_DEL B where B.ID1=A.ID1 and A.dist>B.dist)
and not exists (select 1 from #TA_DEL C where C.ID2=A.ID2 and A.dist>C.dist)

insert into #TA
select * from #TA_DEL A where not exists (select 1 from #TA B where A.ID1 = B.ID1)
and not exists (select 1 from #TA C where A.ID2 = C.ID2)

select * from #TA order by dist

end
go

2、给表TA增加索引
create index idx_ta_id1 on TA(ID1)
go
create index idx_ta_id2 on TA(ID2)
go

3、运行存储过程返回结果
exec sp_ta
go

Generics 2007-10-06

打赏
举报

谢谢楼上的大虾. 我还要好好的消化楼上的算法, 好在我有好多表, 都是几万几十万条的, 只要跟C#程序的结果比较一下就知道算法的正确性了, 大虾就等着接分吧.

wzxhm 2007-10-05

打赏
举报

研究了一下,确实很难!结果可能比例子还要复杂.我再试试看,不知能否给你提供一个满意的答复.

josunmarks 2007-09-30

打赏
举报

delete from TA where ID1 not in(select max(ID1) from TA group by ID2)

Generics 2007-09-29

打赏
举报

13楼的好象结果也不对.

Generics 2007-09-29

打赏
举报

谢谢楼上. 不过好象不对.
下面这是一个很典型的例子:
create table #TA(ID1 numeric(18,0), ID2 numeric(18,0), dist float)
insert #TA
select 10001, 11101, 0.01
union all select 10001, 11102, 0.21
union all select 10001, 11103, 0.31
union all select 10001, 11104, 0.41
union all select 10002, 11101, 0.12
union all select 10002, 11102, 0.32
union all select 10002, 11103, 0.52
union all select 10002, 11104, 0.72
union all select 10003, 11101, 0.23
union all select 10003, 11102, 0.43
union all select 10003, 11103, 0.63
union all select 10003, 11104, 0.83
union all select 10004, 11101, 0.64
union all select 10004, 11102, 0.44
union all select 10004, 11103, 0.24
union all select 10004, 11104, 0.14

结果应该是
10001 11101 0.01
10004 11104 0.14
10002 11102 0.32
10003 11103 0.63

但楼上的结果是
10004 11104 0.83

qiuming0306 2007-09-29

打赏
举报

create table #TA(ID1 numeric(18,0), ID2 numeric(18,0), dist float)
insert #TA
select 10001, 10001, 0.0
UNION ALL select 10002, 10003, 1.0
UNION ALL select 10003, 10004, 3.0
UNION ALL select 10004, 10005, 10.0
UNION ALL select 10008, 10007, 0.5
UNION ALL select 10005, 10003, 0.8
UNION ALL select 10007, 10002, 0.2
UNION ALL select 10009, 10008, 7.0
UNION ALL select 10008, 10006, 0.8
UNION ALL select 10006, 10007, 0.51
select ID1,max(dist) as dist,max(ID2) as ID2
into #b
from #TA group by ID1
select max(ID1) as ID1 ,ID2,max(dist) as dist
from #b group by ID2
drop table #TA
drop table #b

10001 10001 0
10007 10002 0.2
10005 10003 1
10003 10004 3
10004 10005 10
10008 10007 0.8
10009 10008 7

Generics 2007-09-29