数据最大限度均匀分布的问题

ruan_hg 2009-01-24 09:07:30

有100条数据，其规律是以_red,_black,_yellow,_green,_blue……等等颜色的英文单词结尾，如何使其按照各自后缀均匀分布呢？
如这100条数据中有25条是以_red结尾的，那么我任意取4条，则必须有且只有一条是以_red结尾的，其它类推。
这好象纯粹是个算法问题，在“基础问题”中提出过，没有很好的解决，各位高手看到请帮忙。

...全文

191 14 打赏收藏转发到动态举报

写回复

用AI写文章

14 条回复

切换为时间正序

请发表友善的回复…

发表回复

ruan_hg 2009-01-24

打赏
举报

谢谢楼上，辛苦了！
得到的结果应该是所有的行，只不过重新排列过了。
如150条数据中有40条red,30条gren,20条yellow,10条black,50条blue。
那么，我随机取连续的15行，得到的应该包括4条red,3条gren,2条yellow,1条black,5条blue。这里的随机取15行是指
“抽样调查”的意思，即数据已经排列好了，不能再改顺序了，只能查看、验证是否达到要求的意思。

dawugui 2009-01-24

打赏
举报

create table tb(id int , col varchar(10))

insert into tb values(1,'1_red')

insert into tb values(2,'1_black')

insert into tb values(3,'1_yellow')

insert into tb values(4,'1_green')

insert into tb values(5,'1_blue')

insert into tb values(1,'2_red')

insert into tb values(2,'3_black')

insert into tb values(3,'4_yellow')

insert into tb values(4,'5_green')

insert into tb values(5,'6_blue')

insert into tb values(1,'7_red')

insert into tb values(2,'8_black')

insert into tb values(3,'9_yellow')

insert into tb values(4,'10_green')

insert into tb values(5,'11_blue')

insert into tb values(4,'12_green')

insert into tb values(5,'13_blue')

go



select m.id , m.col from

(

  select * , px = ((select count(*) from tb where id = t.id and col < t.col) + 1) - 1 from tb t

) m , 

(

  select id , count(*) cnt from tb group by id

) n

where m.id = n.id and m.px % n.cnt = 0 --(1,2,3...)



drop table tb

/*

id          col        

----------- ---------- 

1           1_red

2           1_black

3           1_yellow

4           1_green

5           1_blue



（所影响的行数为 5 行）

*/

ruan_hg 2009-01-24

打赏
举报

不是这样的，我是说数据update之后，得到的结果中，随机抽取连续的若干行，其各种颜色的密度＝其各自在总表中的密度

ruan_hg 2009-01-24

打赏
举报

谢谢回复
不是这样的，我是说数据update之后，得到的结果中，随机抽取连续的若干行，其各种颜色的密度＝其各自在总表中的密度。
属于数据结构的问题。

dawugui 2009-01-24

打赏
举报

那么我任意取4条，则必须有且只有一条是以_red结尾的，其它类推。
如果是5个结尾的各要一个.

create table tb(col varchar(10))

insert into tb values('1_red')

insert into tb values('1_black')

insert into tb values('1_yellow')

insert into tb values('1_green')

insert into tb values('1_blue')

insert into tb values('2_red')

insert into tb values('3_black')

insert into tb values('4_yellow')

insert into tb values('5_green')

insert into tb values('6_blue')

insert into tb values('7_red')

insert into tb values('8_black')

insert into tb values('9_yellow')

insert into tb values('10_green')

insert into tb values('11_blue')

go



select top 1 * into tmp1 from tb where substring(col,charindex('_',col),len(col)) = '_red' order by newid() 

select top 1 * into tmp2 from tb where substring(col,charindex('_',col),len(col)) <> '_black' order by newid()

select top 1 * into tmp3 from tb where substring(col,charindex('_',col),len(col)) = '_yellow' order by newid() 

select top 1 * into tmp4 from tb where substring(col,charindex('_',col),len(col)) <> '_green' order by newid()

select top 1 * into tmp5 from tb where substring(col,charindex('_',col),len(col)) = '_blue' order by newid() 



select * from tmp1

union all

select * from tmp2

union all

select * from tmp3

union all

select * from tmp4

union all

select * from tmp5



drop table tb , tmp1 , tmp2, tmp3 , tmp4, tmp5 



/*

col        

---------- 

7_red

5_green

4_yellow

6_blue

1_blue



（所影响的行数为 5 行）

*/

dawugui 2009-01-24

打赏
举报

create table tb(col varchar(10))

insert into tb values('1_red')

insert into tb values('1_black')

insert into tb values('1_yellow')

insert into tb values('1_green')

insert into tb values('1_blue')

insert into tb values('2_red')

insert into tb values('3_black')

insert into tb values('4_yellow')

insert into tb values('5_green')

insert into tb values('6_blue')

insert into tb values('7_red')

insert into tb values('8_black')

insert into tb values('9_yellow')

insert into tb values('10_green')

insert into tb values('11_blue')

go



select top 1 * into tmp1 from tb where substring(col,charindex('_',col),len(col)) = '_red' order by newid() 

select top 3 * into tmp2 from tb where substring(col,charindex('_',col),len(col)) <> '_red' order by newid()



select * from tmp1

union all

select * from tmp2



drop table tb , tmp1 , tmp2



/*

col        

---------- 

2_red

6_blue

1_green

3_black



（所影响的行数为 4 行）

*/

ruan_hg 2009-01-24

打赏
举报

不行的：
我需要的是各自按各自在总数中所占比例均匀分布，查询了一下，发现后几条记录都是_red。
看了不能有newid，随机

jinjazz 2009-01-24

打赏
举报

declare  @t table (v varchar(100))

insert into @t

select top 10 name+'_red' from sysobjects   

insert into @t

select top 25 name+'_yellow' from sysobjects   

insert into @t

select top 30 name+'_blue' from sysobjects  

insert into @t

select top 15 name+'_other' from sysobjects  



select  *   from(

select   *,rn=row_number()over(partition by right(v,4) order by newid()) from @t )a

order by rn,newid()

ruan_hg 2009-01-24

打赏
举报

jinjazz:
将top4去掉发现，排列如下：
name_red
name_yellow
name_blue
name_other
name_other
name_blue
name_yellow
name_red
name_red
name_yellow
name_blue
name_other
name_other
name_blue
name_yellow
name_red
……
已经接近最大限度交错排列的要求了，似乎还可以改进？
另外，如果每种颜色数量不尽相同，是否可以达到相同效果？

jinjazz 2009-01-24

打赏
举报

declare  @t table (v varchar(100))

insert into @t

select top 25 name+'_red' from sysobjects   

insert into @t

select top 25 name+'_yellow' from sysobjects   

insert into @t

select top 25 name+'_blue' from sysobjects  

insert into @t

select top 25 name+'_other' from sysobjects  



select top 4* from(

select   *,rn=row_number()over(partition by right(v,4) order by newid()) from @t )a

order by rn

-狙击手- 2009-01-24

打赏
举报

这个问题以前讨论过，你找找，

要回家来不及了，节后再来看

ruan_hg 2009-01-24

打赏
举报

楼上这位高手：
我的要求是这样的，数据重新排列后，不能再改顺序，然后随机抽样连续的几条查看其分布情况。
也就是说得到的结果是重新排列后的情况，抽样只是一个视图。

fcuandy 2009-01-24

打赏
举报

declare  @t table (v varchar(100))

insert into @t

select top 10 name+'_red' from sys.objects   

insert into @t

select top 25 name+'_yellow' from sys.objects   

insert into @t

select top 30 name+'_blue' from sys.objects 

insert into @t

select top 15 name+'_other' from sys.objects  



select * from @t

/*

sysrowsetcolumns_red

sysrowsets_red

sysallocunits_red

sysfiles1_red

syshobtcolumns_red

syshobts_red

sysftinds_red

sysserefs_red

sysowners_red

sysprivs_red

sysrowsetcolumns_yellow

sysrowsets_yellow

sysallocunits_yellow

sysfiles1_yellow

syshobtcolumns_yellow

syshobts_yellow

sysftinds_yellow

sysserefs_yellow

sysowners_yellow

sysprivs_yellow

sysschobjs_yellow

syscolpars_yellow

sysnsobjs_yellow

syscerts_yellow

sysxprops_yellow

sysscalartypes_yellow

systypedsubobjs_yellow

sysidxstats_yellow

sysiscols_yellow

sysbinobjs_yellow

sysobjvalues_yellow

sysclsobjs_yellow

sysrowsetrefs_yellow

sysremsvcbinds_yellow

sysxmitqueue_yellow

sysrowsetcolumns_blue

sysrowsets_blue

sysallocunits_blue

sysfiles1_blue

syshobtcolumns_blue

syshobts_blue

sysftinds_blue

sysserefs_blue

sysowners_blue

sysprivs_blue

sysschobjs_blue

syscolpars_blue

sysnsobjs_blue

syscerts_blue

sysxprops_blue

sysscalartypes_blue

systypedsubobjs_blue

sysidxstats_blue

sysiscols_blue

sysbinobjs_blue

sysobjvalues_blue

sysclsobjs_blue

sysrowsetrefs_blue

sysremsvcbinds_blue

sysxmitqueue_blue

sysrts_blue

sysconvgroup_blue

sysdesend_blue

sysdercv_blue

syssingleobjrefs_blue

sysrowsetcolumns_other

sysrowsets_other

sysallocunits_other

sysfiles1_other

syshobtcolumns_other

syshobts_other

sysftinds_other

sysserefs_other

sysowners_other

sysprivs_other

sysschobjs_other

syscolpars_other

sysnsobjs_other

syscerts_other

sysxprops_other

*/



declare @n int

set @n=8  --抽样取8条



select v from

	(select row_number() over(partition by stuff(v,1,charindex('_',v),'') order by newid()) idx,stuff(v,1,charindex('_',v),'') color,* from @t) a

cross apply

	(

	select count(*) cntAll from @t

	) b

cross apply

	(

		select count(*) cnt from @t

		where stuff(v,1,charindex('_',v),'') = a.color

	) c

	where cast(round(cnt*1.0*@n/cntAll,0) as int)>=idx



/*

syshobts_blue

sysftinds_blue

syscolpars_blue



syshobts_other

syscolpars_other



sysrowsets_red



sysscalartypes_yellow

sysidxstats_yellow

sysftinds_yellow

*/



/*

10:30:25:15

2:6:5:3



结果共选了9条，原因很简单，8并不是这几个数互质比例的最小公倍数,所以拼结果时会有出入，如果是选 2+6+5+3 即 16条时(总数/最大公约数)可以选到最匹配的结果:

*/

set @n=16



select v from

	(select row_number() over(partition by stuff(v,1,charindex('_',v),'') order by newid()) idx,stuff(v,1,charindex('_',v),'') color,* from @t) a

cross apply

	(

	select count(*) cntAll from @t

	) b

cross apply

	(

		select count(*) cnt from @t

		where stuff(v,1,charindex('_',v),'') = a.color

	) c

	where cast(round(cnt*1.0*@n/cntAll,0) as int)>=idx



/*

sysnsobjs_blue

sysclsobjs_blue

syscerts_blue

sysrowsets_blue

sysserefs_blue

sysconvgroup_blue

blue :totals 6



sysprivs_other

sysrowsetcolumns_other

sysxprops_other

other: totals 3



sysprivs_red

sysfiles1_red

red: totals 2



sysclsobjs_yellow

sysrowsets_yellow

syscerts_yellow

syshobtcolumns_yellow

sysiscols_yellow

yellow: totals 5

*/



/*

至于取8条时，取到了9条，如何取到8条，看你如何取舍了，即取舍规则是什么。

*/





/*

而下面这样，通过增加小数精度来比较,也是有问题的.

*/

set @n=8  



select v from

	(select row_number() over(partition by stuff(v,1,charindex('_',v),'') order by newid()) idx,stuff(v,1,charindex('_',v),'') color,* from @t) a

cross apply

	(

	select count(*) cntAll from @t

	) b

cross apply

	(

		select count(*) cnt from @t

		where stuff(v,1,charindex('_',v),'') = a.color

	) c

	where cnt*1.0*@n/cntAll >=cnt*1.0*idx/cnt



/*

sysxmitqueue_blue

sysrts_blue

systypedsubobjs_blue

sysrowsets_other

sysrowsets_red

sysscalartypes_yellow

syscolpars_yellow

*/