Storm Trident DPRC不稳定?

qq_39052701 2018-09-23 02:13:51
自己改写了一个trident drpc的例子,就是推文包含#FIFA的统计,包含三个类,各类说明和源码如下:
1.FakeTweetSpout ,用于产生(推文,国家)的batch tuples。为了观察输出结果,在其emitBatch方法里只允许发送batchId为1的仅1个批次的tuples(63行)。

package L08bigdata.storm.trident.drpc;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

import org.apache.storm.task.TopologyContext;
import org.apache.storm.trident.operation.TridentCollector;
import org.apache.storm.trident.spout.IBatchSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;

public class FakeTweetSpout implements IBatchSpout{


private static final long serialVersionUID = 10L;
private int batchSize;
private HashMap<Long, List<List<Object>>> batchesMap = new HashMap<Long, List<List<Object>>>();
public FakeTweetSpout(int batchSize) {
this.batchSize = batchSize;
}

private static final Map<Integer, String> TWEET_MAP = new HashMap<Integer, String>();
static {
TWEET_MAP.put(0, "#FIFA worldcup");
TWEET_MAP.put(1, "#FIFA worldcup");
TWEET_MAP.put(2, "#FIFA worldcup");
TWEET_MAP.put(3, "#FIFA worldcup");
TWEET_MAP.put(4, "#Movie top 10");
}

private static final Map<Integer, String> COUNTRY_MAP = new HashMap<Integer, String>();
static {
COUNTRY_MAP.put(0, "United State");
COUNTRY_MAP.put(1, "Japan");
COUNTRY_MAP.put(2, "India");
COUNTRY_MAP.put(3, "China");
COUNTRY_MAP.put(4, "Brazil");
}

private List<Object> recordGenerator() {
final Random rand = new Random();
int randomNumber = rand.nextInt(5);
int randomNumber2 = rand.nextInt(5);
System.out.println(TWEET_MAP.get(randomNumber)+" "+COUNTRY_MAP.get(randomNumber2));

return new Values(TWEET_MAP.get(randomNumber),COUNTRY_MAP.get(randomNumber2));
}

public void ack(long batchId) {
this.batchesMap.remove(batchId);

}

public void close() {
// TODO Auto-generated method stub

}

public void emitBatch(long batchId, TridentCollector collector) {
if(batchId!=1) //此时就发送一个batch,对比rpc的execute函数执行情况。
return;
List<List<Object>> batches = this.batchesMap.get(batchId);
if(batches == null) {
batches = new ArrayList<List<Object>>();;
for (int i=0;i < this.batchSize;i++) {
batches.add(this.recordGenerator());
}
this.batchesMap.put(batchId, batches);
}
for(List<Object> list : batches){
collector.emit(list);
}

}

public Map getComponentConfiguration() {
// TODO Auto-generated method stub
return null;
}

public Fields getOutputFields() {
return new Fields("text","Country");
}

public void open(Map arg0, TopologyContext arg1) {
// TODO Auto-generated method stub

}

}

2.TridentUtility,封装FakeTweetSpout所用的function、filter。

package L08bigdata.storm.trident.drpc;

import org.apache.storm.trident.operation.BaseFilter;
import org.apache.storm.trident.operation.BaseFunction;
import org.apache.storm.trident.operation.TridentCollector;
import org.apache.storm.trident.tuple.TridentTuple;
import org.apache.storm.tuple.Values;

public class TridentUtility {
/**
* Get the comma separated value as input, split the field by comma, and
* then emits multiple tuple as output.
*
*/
public static class Split extends BaseFunction {

private static final long serialVersionUID = 2L;

public void execute(TridentTuple tuple, TridentCollector collector) {
String countries = tuple.getString(0);
//System.out.println(countries);
for (String word : countries.split(",")) {
//System.out.println("word -"+word);
collector.emit(new Values(word));
}
}
}

/**
* This class extends BaseFilter and contain isKeep method which emits only
* those tuple which has #FIFA in text field.
*/
public static class TweetFilter extends BaseFilter {

private static final long serialVersionUID = 1L;

public boolean isKeep(TridentTuple tuple) {
if (tuple.getString(0).contains("#FIFA")) {
return true;
} else {
return false;
}
}

}

/**
* This class extends BaseFilter and contain isKeep method which will print
* the input tuple.
*
*/
public static class Print extends BaseFilter {

private static final long serialVersionUID = 1L;

public boolean isKeep(TridentTuple tuple) {
//System.out.println(tuple);
return true;
}

}
}

3.DistributedRPC ,生成包含两个spout的topology,提交,然后启动100次针对Japan的查询。为了查看完整的远程调用结果,代码的最后一句each(new Fields("aa"),new FilterNull())在此处给注释了。

package L08bigdata.storm.trident.drpc;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.LocalDRPC;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.trident.TridentState;
import org.apache.storm.trident.TridentTopology;
import org.apache.storm.trident.operation.builtin.Count;
import org.apache.storm.trident.operation.builtin.FilterNull;
import org.apache.storm.trident.operation.builtin.MapGet;
import org.apache.storm.trident.testing.MemoryMapState;
import org.apache.storm.tuple.Fields;
import org.apache.storm.utils.DRPCClient;

public class DistributedRPC {

public static void main(String[] args) throws Exception {
Config conf = new Config();

if (args.length == 0) {
LocalDRPC drpc = new LocalDRPC();
LocalCluster cluster = new LocalCluster();
cluster.submitTopology("CountryCount", conf, buildTopology(drpc));

Thread.sleep(2000);//等待topology启动,然后再开始查。

for(int i=0; i<100 ; i++) {
System.out.println("Result - "+drpc.execute("queryFunction", "Japan")); //第一个参数是函数名
Thread.sleep(3000);
}
}

else {
conf.setNumWorkers(3);
StormSubmitter.submitTopology(args[0], conf, buildTopology(null));
Thread.sleep(2000);
DRPCClient client = new DRPCClient(conf, "RRPC-Server", 1234);
System.out.println(client.execute("Count", "Japan"));
}
}


public static StormTopology buildTopology(LocalDRPC drpc) {

FakeTweetSpout spout = new FakeTweetSpout(10);
TridentTopology topology = new TridentTopology();

//一个topology创建两个stream,一个是数据处理stream,一个是drpc stream。


//创建一个名为spout1的流,从spout中接收数据
TridentState countryCount = topology.newStream("spout1", spout) //返回值countryCount是由persistentAggregate产生的,当然可以查询。
.shuffle()
.each(new Fields("text","Country"), new TridentUtility.TweetFilter()).groupBy(new Fields("Country"))
.persistentAggregate(new MemoryMapState.Factory(),new Fields("Country"), new Count(), new Fields("count"))//在内存中形成(Country-count)对
.parallelismHint(2);

try {
Thread.sleep(2000);
} catch (InterruptedException e) {
}

topology.newDRPCStream("queryFunction", drpc)//第一个参数是定义的函数名
.each(new Fields("args"), new TridentUtility.Split(), new Fields("Country_to_be_queried"))

//由于内存中形成(key,value)结构,
//所以下面的方法中第一个参数是待查询对象
//第二个是待查询key的值
//第三个是查询得到的value的值赋予的field值。
//注意,经过测试发现,最后一个参数中field的值随便写,//没有必要一定是上个流中的“count”。
.stateQuery(countryCount, new Fields("Country_to_be_queried"), new MapGet(),new Fields("aa"));
//.each(new Fields("aa"),new FilterNull());
}
}



我使用local cluster模式运行。
运行结果很奇怪。我认为,由于限定了只发送一个batch的数据,那么,在后期drpc查询时,查询的结果(来自Japan的推文中,包含#FIFA的推文个数)应该始终是一样的,但实际结果很奇怪。如,batch数据如下:

#Movie top 10 United State
#FIFA worldcup United State
#FIFA worldcup China
#FIFA worldcup India
#FIFA worldcup Japan
#FIFA worldcup China
#FIFA worldcup Japan
#Movie top 10 India
#FIFA worldcup United State
#FIFA worldcup United State

也就是说,来自Japan的推文中,包含#FIFA的推文应该是2个。
而在100次调用中,打印输出结果片段如下:

Result - [["Japan","Japan",2]]
Result - [["Japan","Japan",2]]
Result - [["Japan","Japan",2]]
Result - [["Japan","Japan",2]]
Result - [["Japan","Japan",null]]
Result - [["Japan","Japan",null]]
Result - [["Japan","Japan",2]]
Result - [["Japan","Japan",2]]
Result - [["Japan","Japan",null]]
Result - [["Japan","Japan",2]]
Result - [["Japan","Japan",null]]
Result - [["Japan","Japan",null]]
Result - [["Japan","Japan",2]]


各位大神,为什么查询结果这么不稳定?是因为local cluster 模式的原因?或是有什么我没有掌握的知识点?

...全文
193 回复 打赏 收藏 转发到动态 举报
写回复
用AI写文章
回复
切换为时间正序
请发表友善的回复…
发表回复

20,808

社区成员

发帖
与我相关
我的任务
社区描述
Hadoop生态大数据交流社区,致力于有Hadoop,hive,Spark,Hbase,Flink,ClickHouse,Kafka,数据仓库,大数据集群运维技术分享和交流等。致力于收集优质的博客
社区管理员
  • 分布式计算/Hadoop社区
  • 涤生大数据
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧