mysql分层随机抽样

benpao002 2018-12-26 06:16:59
想用mysql实现分层随机抽样怎么写,样例数据如下。想从label为s1随机选4个,label为s4随机选2个,label为s10随机选3个,数据量比较大,label很多,有好几百个,只弄了3个label样例数据,想问下这个sql怎么写
id label n
1000000000000059993 s1 4
1000000000000925982 s1 4
1000000000001199348 s1 4
1000000000001638782 s1 4
1000000000001682891 s1 4
1000000000002584699 s1 4
1000000000000079389 s1 4
1000000000001217799 s1 4
101050481232942966 s1 4
676549513128752044 s1 4
890249072426673466 s1 4
212745469495313447 s4 2
510346678931418680 s4 2
803746678931739658 s4 2
670647327506531562 s4 2
1000000000003045625 s4 2
1000000000002091492 s4 2
1000000000002856240 s4 2
1000000000003058272 s4 2
1000000000000734975 s10 3
1000000000002910312 s10 3
1000000000000260015 s10 3
1000000000001252400 s10 3
266251025098187409 s10 3
509450463862531415 s10 3
845751401207367664 s10 3
1000000000002407715 s10 3
1000000000002940425 s10 3
...全文
1184 4 打赏 收藏 转发到动态 举报
写回复
用AI写文章
4 条回复
切换为时间正序
请发表友善的回复…
发表回复
Inter_Ares 2019-01-02
  • 打赏
  • 举报
回复
一个SQL文应该实现不了吧。我觉得可以分步来实现。
1、写一个存储过程,统计实时的标签信息,写入一张临时表。
如:标签,数量
2、临时表可以加一个字段,填你要随机取的该标签的记录数,不能超过统计值。如果是随机的,在上面那个存储过程里就一起实现。
3、在写一个存储过程,循环执行这个临时表的数据,根据记录数循环产生随机数,然后去查实际记录,可能需要判断是否有重复。

如果取得标签数也是随机的,可以一个存储过程里实现,也只需要一个最终结果表。
如果要根据实际情况,每次要特定设定的话,需要写两个存储过程,一个临时表,一个最终结果表。
benpao002 2018-12-28
  • 打赏
  • 举报
回复
重新研究了下,要是这个rand括号里面的参数取出来的数字不重复,是可以运行的,要是有重复就程序报错,想问下这种该怎么解决优化呢,如果选择的分层抽样变量很多,感觉很难有随机数符合要求,现在我的label变量有200多个。求指导,非常感谢

引用 1 楼 吉普赛的歌 的回复:
此种写法主要从效率方面考虑。你先试下是否合用
DROP TABLE IF EXISTS t; 
CREATE TABLE t(
id bigint
,label VARCHAR(20)
,n INT
);
INSERT INTO t VALUES('1000000000000059993','s1','4');
INSERT INTO t VALUES('1000000000000925982','s1','4');
INSERT INTO t VALUES('1000000000001199348','s1','4');
INSERT INTO t VALUES('1000000000001638782','s1','4');
INSERT INTO t VALUES('1000000000001682891','s1','4');
INSERT INTO t VALUES('1000000000002584699','s1','4');
INSERT INTO t VALUES('1000000000000079389','s1','4');
INSERT INTO t VALUES('1000000000001217799','s1','4');
INSERT INTO t VALUES('101050481232942966','s1','4');
INSERT INTO t VALUES('676549513128752044','s1','4');
INSERT INTO t VALUES('890249072426673466','s1','4');
INSERT INTO t VALUES('212745469495313447','s4','2');
INSERT INTO t VALUES('510346678931418680','s4','2');
INSERT INTO t VALUES('803746678931739658','s4','2');
INSERT INTO t VALUES('670647327506531562','s4','2');
INSERT INTO t VALUES('1000000000003045625','s4','2');
INSERT INTO t VALUES('1000000000002091492','s4','2');
INSERT INTO t VALUES('1000000000002856240','s4','2');
INSERT INTO t VALUES('1000000000003058272','s4','2');
INSERT INTO t VALUES('1000000000000734975','s10','3');
INSERT INTO t VALUES('1000000000002910312','s10','3');
INSERT INTO t VALUES('1000000000000260015','s10','3');
INSERT INTO t VALUES('1000000000001252400','s10','3');
INSERT INTO t VALUES('266251025098187409','s10','3');
INSERT INTO t VALUES('509450463862531415','s10','3');
INSERT INTO t VALUES('845751401207367664','s10','3');
INSERT INTO t VALUES('1000000000002407715','s10','3');
INSERT INTO t VALUES('1000000000002940425','s10','3');

#1. 表上创建索引
create index ix_t_label_id on t (label,id);

#2. 增加序数表, 如果需要更多数值,自己添加
drop table if exists seq;
create table seq(
id int
);
insert into seq values(1);
insert into seq values(2);
insert into seq values(3);
insert into seq values(4);
insert into seq values(5);
insert into seq values(6);
insert into seq values(7);
insert into seq values(8);
insert into seq values(9);
insert into seq values(10);

#3. 创建临时表
drop TEMPORARY table if exists tmp;
create TEMPORARY table tmp(
label varchar(20),
rid int,
primary key(label,rid)
);
#4. 插入生成好的 随机数及对应的位置 的数据到临时表
insert into tmp
select label,FLOOR(1+rand(3)*(cnt-1)) as rid from (
select
id
,label
,count(1) as cnt
,case when label='s1' then 4
when label='s4' then 2
when label='s10' then 3
else 0 end
as randomNum
from t
where label in ('s1','s4','s10')
group by label
) as tt cross join seq on tt.randomNum>=seq.id
order by label;
#select * from tmp;

#原表以 label 分组排名得到rid, 根据连接获取需要数据
select a.id
,a.label
,a.n
from (
select *
,(select count(1) from t as b where a.label=b.label and a.id>=b.id) as rid
from t as a
) as a inner join tmp on a.label=tmp.label and a.rid=tmp.rid
order by a.label,a.rid ;

/*
+---------------------+-------+---+
| id | label | n |
+---------------------+-------+---+
| 676549513128752044 | s1 | 4 |
| 1000000000000059993 | s1 | 4 |
| 1000000000001199348 | s1 | 4 |
| 1000000000001682891 | s1 | 4 |
| 509450463862531415 | s10 | 3 |
| 1000000000000260015 | s10 | 3 |
| 1000000000001252400 | s10 | 3 |
| 1000000000002091492 | s4 | 2 |
| 1000000000003045625 | s4 | 2 |
+---------------------+-------+---+
*/
吉普赛的歌 2018-12-27
  • 打赏
  • 举报
回复
此种写法主要从效率方面考虑。你先试下是否合用
DROP TABLE IF EXISTS t; 
CREATE TABLE t(
id bigint
,label VARCHAR(20)
,n INT	
);
INSERT INTO t VALUES('1000000000000059993','s1','4');
INSERT INTO t VALUES('1000000000000925982','s1','4');
INSERT INTO t VALUES('1000000000001199348','s1','4');
INSERT INTO t VALUES('1000000000001638782','s1','4');
INSERT INTO t VALUES('1000000000001682891','s1','4');
INSERT INTO t VALUES('1000000000002584699','s1','4');
INSERT INTO t VALUES('1000000000000079389','s1','4');
INSERT INTO t VALUES('1000000000001217799','s1','4');
INSERT INTO t VALUES('101050481232942966','s1','4');
INSERT INTO t VALUES('676549513128752044','s1','4');
INSERT INTO t VALUES('890249072426673466','s1','4');
INSERT INTO t VALUES('212745469495313447','s4','2');
INSERT INTO t VALUES('510346678931418680','s4','2');
INSERT INTO t VALUES('803746678931739658','s4','2');
INSERT INTO t VALUES('670647327506531562','s4','2');
INSERT INTO t VALUES('1000000000003045625','s4','2');
INSERT INTO t VALUES('1000000000002091492','s4','2');
INSERT INTO t VALUES('1000000000002856240','s4','2');
INSERT INTO t VALUES('1000000000003058272','s4','2');
INSERT INTO t VALUES('1000000000000734975','s10','3');
INSERT INTO t VALUES('1000000000002910312','s10','3');
INSERT INTO t VALUES('1000000000000260015','s10','3');
INSERT INTO t VALUES('1000000000001252400','s10','3');
INSERT INTO t VALUES('266251025098187409','s10','3');
INSERT INTO t VALUES('509450463862531415','s10','3');
INSERT INTO t VALUES('845751401207367664','s10','3');
INSERT INTO t VALUES('1000000000002407715','s10','3');
INSERT INTO t VALUES('1000000000002940425','s10','3');

#1. 表上创建索引
create index ix_t_label_id on t (label,id);

#2. 增加序数表, 如果需要更多数值,自己添加
drop table if exists seq;
create table seq(
id int
);
insert into seq values(1);
insert into seq values(2);
insert into seq values(3);
insert into seq values(4);
insert into seq values(5);
insert into seq values(6);
insert into seq values(7);
insert into seq values(8);
insert into seq values(9);
insert into seq values(10);

#3. 创建临时表
drop TEMPORARY table if exists tmp;
create TEMPORARY table tmp(
   label varchar(20),
	 rid   int,
	 primary key(label,rid)
);
#4. 插入生成好的 随机数及对应的位置 的数据到临时表
insert into tmp
select label,FLOOR(1+rand(3)*(cnt-1)) as rid from (
select 
id
,label
,count(1) as cnt
,case when label='s1' then 4 
when label='s4' then 2
when label='s10' then 3
else 0 end
as randomNum
from t
where label in ('s1','s4','s10')
group by label
) as tt cross join seq on tt.randomNum>=seq.id
order by label;
#select * from tmp;

#原表以 label 分组排名得到rid, 根据连接获取需要数据
select a.id
,a.label
,a.n
from (
select * 
,(select count(1) from t as b where a.label=b.label and a.id>=b.id) as rid
from t as a
) as a inner join tmp on a.label=tmp.label and a.rid=tmp.rid
order by a.label,a.rid ;

/*
+---------------------+-------+---+
| id                  | label | n |
+---------------------+-------+---+
|  676549513128752044 | s1    | 4 |
| 1000000000000059993 | s1    | 4 |
| 1000000000001199348 | s1    | 4 |
| 1000000000001682891 | s1    | 4 |
|  509450463862531415 | s10   | 3 |
| 1000000000000260015 | s10   | 3 |
| 1000000000001252400 | s10   | 3 |
| 1000000000002091492 | s4    | 2 |
| 1000000000003045625 | s4    | 2 |
+---------------------+-------+---+
*/
benpao002 2018-12-27
  • 打赏
  • 举报
回复
想问下改个随机数就报错,把rand(3) 变为 rand(4)就报错。 [Err] 1062 - Duplicate entry 's1-7' for key 'PRIMARY'。我把primary key(label,rid)这个删掉,抽的数据就有重复,怎么能解决呢

56,687

社区成员

发帖
与我相关
我的任务
社区描述
MySQL相关内容讨论专区
社区管理员
  • MySQL
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧