Storm Trident DPRC不稳定？

qq_39052701 2018-09-23 02:13:51

自己改写了一个trident drpc的例子，就是推文包含#FIFA的统计，包含三个类，各类说明和源码如下：
1.FakeTweetSpout ，用于产生（推文，国家）的batch tuples。为了观察输出结果，在其emitBatch方法里只允许发送batchId为1的仅1个批次的tuples（63行）。



package L08bigdata.storm.trident.drpc;



import java.util.ArrayList;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import java.util.Random;



import org.apache.storm.task.TopologyContext;

import org.apache.storm.trident.operation.TridentCollector;

import org.apache.storm.trident.spout.IBatchSpout;

import org.apache.storm.tuple.Fields;

import org.apache.storm.tuple.Values;



public class FakeTweetSpout implements IBatchSpout{



	

	private static final long serialVersionUID = 10L;

	private int batchSize;

	private HashMap<Long, List<List<Object>>> batchesMap = new HashMap<Long, List<List<Object>>>();

	public FakeTweetSpout(int batchSize) {

		this.batchSize = batchSize;

	}

	

	private static final Map<Integer, String> TWEET_MAP = new HashMap<Integer, String>();

	static {

		TWEET_MAP.put(0, "#FIFA worldcup");

		TWEET_MAP.put(1, "#FIFA worldcup");

		TWEET_MAP.put(2, "#FIFA worldcup");

		TWEET_MAP.put(3, "#FIFA worldcup");

		TWEET_MAP.put(4, "#Movie top 10");

	}

	

	private static final Map<Integer, String> COUNTRY_MAP = new HashMap<Integer, String>();

	static {

		COUNTRY_MAP.put(0, "United State");

		COUNTRY_MAP.put(1, "Japan");

		COUNTRY_MAP.put(2, "India");

		COUNTRY_MAP.put(3, "China");

		COUNTRY_MAP.put(4, "Brazil");

	}

	

	private List<Object> recordGenerator() {

		final Random rand = new Random();

		int randomNumber = rand.nextInt(5);

		int randomNumber2 = rand.nextInt(5);

		System.out.println(TWEET_MAP.get(randomNumber)+"      "+COUNTRY_MAP.get(randomNumber2));



		return new Values(TWEET_MAP.get(randomNumber),COUNTRY_MAP.get(randomNumber2));

	}

	

	public void ack(long batchId) {

		this.batchesMap.remove(batchId);

		

	}



	public void close() {

		// TODO Auto-generated method stub

		

	}



	public void emitBatch(long batchId, TridentCollector collector) {

		if(batchId!=1) //此时就发送一个batch，对比rpc的execute函数执行情况。

			return;

		List<List<Object>> batches = this.batchesMap.get(batchId);

		if(batches == null) {

			batches = new ArrayList<List<Object>>();;

			for (int i=0;i < this.batchSize;i++) {

				batches.add(this.recordGenerator());

			}

			this.batchesMap.put(batchId, batches);

		}

		for(List<Object> list : batches){

            collector.emit(list);

        }

		

	}



	public Map getComponentConfiguration() {

		// TODO Auto-generated method stub

		return null;

	}



	public Fields getOutputFields() {

		return new Fields("text","Country");

	}



	public void open(Map arg0, TopologyContext arg1) {

		// TODO Auto-generated method stub

		

	}



}

2.TridentUtility，封装FakeTweetSpout所用的function、filter。



package L08bigdata.storm.trident.drpc;



import org.apache.storm.trident.operation.BaseFilter;

import org.apache.storm.trident.operation.BaseFunction;

import org.apache.storm.trident.operation.TridentCollector;

import org.apache.storm.trident.tuple.TridentTuple;

import org.apache.storm.tuple.Values;



public class TridentUtility {

	/**

	 * Get the comma separated value as input, split the field by comma, and

	 * then emits multiple tuple as output.

	 * 

	 */

	public static class Split extends BaseFunction {



		private static final long serialVersionUID = 2L;



		public void execute(TridentTuple tuple, TridentCollector collector) {

			String countries = tuple.getString(0);

			//System.out.println(countries);

			for (String word : countries.split(",")) {

				//System.out.println("word -"+word);

				collector.emit(new Values(word));

			}

		}

	}



	/**

	 * This class extends BaseFilter and contain isKeep method which emits only

	 * those tuple which has #FIFA in text field.

	 */

	public static class TweetFilter extends BaseFilter {



		private static final long serialVersionUID = 1L;



		public boolean isKeep(TridentTuple tuple) {

			if (tuple.getString(0).contains("#FIFA")) {

				return true;

			} else {

				return false;

			}

		}



	}



	/**

	 * This class extends BaseFilter and contain isKeep method which will print

	 * the input tuple.

	 * 

	 */

	public static class Print extends BaseFilter {



		private static final long serialVersionUID = 1L;



		public boolean isKeep(TridentTuple tuple) {

			//System.out.println(tuple);

			return true;

		}



	}

}

3.DistributedRPC ，生成包含两个spout的topology，提交，然后启动100次针对Japan的查询。为了查看完整的远程调用结果，代码的最后一句each(new Fields("aa"),new FilterNull())在此处给注释了。



package L08bigdata.storm.trident.drpc;

import org.apache.storm.Config;

import org.apache.storm.LocalCluster;

import org.apache.storm.LocalDRPC;

import org.apache.storm.StormSubmitter;

import org.apache.storm.generated.StormTopology;

import org.apache.storm.trident.TridentState;

import org.apache.storm.trident.TridentTopology;

import org.apache.storm.trident.operation.builtin.Count;

import org.apache.storm.trident.operation.builtin.FilterNull;

import org.apache.storm.trident.operation.builtin.MapGet;

import org.apache.storm.trident.testing.MemoryMapState;

import org.apache.storm.tuple.Fields;

import org.apache.storm.utils.DRPCClient;



public class DistributedRPC {

		

	public static void main(String[] args) throws Exception {

		Config conf = new Config();

		

		if (args.length == 0) {

			LocalDRPC drpc = new LocalDRPC();

			LocalCluster cluster = new LocalCluster();

			cluster.submitTopology("CountryCount", conf, buildTopology(drpc));

			

			Thread.sleep(2000);//等待topology启动，然后再开始查。

			

			for(int i=0; i<100 ; i++) {

				System.out.println("Result - "+drpc.execute("queryFunction", "Japan")); //第一个参数是函数名

				Thread.sleep(3000);

				}

		} 

		

		else {

			conf.setNumWorkers(3);

			StormSubmitter.submitTopology(args[0], conf, buildTopology(null));

			Thread.sleep(2000);

   		  	DRPCClient client = new DRPCClient(conf, "RRPC-Server", 1234);

   		  	System.out.println(client.execute("Count", "Japan"));

		}

	}



	

	public static StormTopology buildTopology(LocalDRPC drpc) {



		FakeTweetSpout spout = new FakeTweetSpout(10);

		TridentTopology topology = new TridentTopology();

		

		//一个topology创建两个stream，一个是数据处理stream，一个是drpc stream。

		

		

		//创建一个名为spout1的流，从spout中接收数据

		TridentState countryCount = topology.newStream("spout1", spout) //返回值countryCount是由persistentAggregate产生的，当然可以查询。

				.shuffle()

				.each(new Fields("text","Country"), new TridentUtility.TweetFilter()).groupBy(new Fields("Country"))

				.persistentAggregate(new MemoryMapState.Factory(),new Fields("Country"), new Count(), new Fields("count"))//在内存中形成（Country-count）对

				.parallelismHint(2);

		

		try {

			Thread.sleep(2000);

		} catch (InterruptedException e) {

		}

		

		topology.newDRPCStream("queryFunction", drpc)//第一个参数是定义的函数名

		.each(new Fields("args"), new TridentUtility.Split(), new Fields("Country_to_be_queried"))	

		

		//由于内存中形成（key，value）结构，

		//所以下面的方法中第一个参数是待查询对象

		//第二个是待查询key的值

		//第三个是查询得到的value的值赋予的field值。

		//注意，经过测试发现，最后一个参数中field的值随便写，//没有必要一定是上个流中的“count”。

		.stateQuery(countryCount, new Fields("Country_to_be_queried"), new MapGet(),new Fields("aa"));

		//.each(new Fields("aa"),new FilterNull());

	}

}

我使用local cluster模式运行。
运行结果很奇怪。我认为，由于限定了只发送一个batch的数据，那么，在后期drpc查询时，查询的结果（来自Japan的推文中，包含#FIFA的推文个数）应该始终是一样的，但实际结果很奇怪。如，batch数据如下：



#Movie top 10      United State

#FIFA worldcup      United State

#FIFA worldcup      China

#FIFA worldcup      India

#FIFA worldcup      Japan

#FIFA worldcup      China

#FIFA worldcup      Japan

#Movie top 10      India

#FIFA worldcup      United State

#FIFA worldcup      United State

也就是说，来自Japan的推文中，包含#FIFA的推文应该是2个。
而在100次调用中，打印输出结果片段如下：



Result - [["Japan","Japan",2]]

Result - [["Japan","Japan",2]]

Result - [["Japan","Japan",2]]

Result - [["Japan","Japan",2]]

Result - [["Japan","Japan",null]]

Result - [["Japan","Japan",null]]

Result - [["Japan","Japan",2]]

Result - [["Japan","Japan",2]]

Result - [["Japan","Japan",null]]

Result - [["Japan","Japan",2]]

Result - [["Japan","Japan",null]]

Result - [["Japan","Japan",null]]

Result - [["Japan","Japan",2]]