SELECT * FROM A
WHERE EXISTS (
    SELECT * FROM B WHERE A.C1 = B.C1 AND A.C2 = B.C2
)

SELECT C1, C2
FROM (
    SELECT DISTINCT C1, C2 FROM A
    UNION ALL
    SELECT DISTINCT C1, C2 FROM B
) TB
GROUP BY C1, C2
HAVING COUNT(1) > 1
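Both statements answer the same question: which (C1, C2) pairs occur in both table A and table B? The first returns every row of A whose (C1, C2) pair also exists in B; the second returns the distinct (C1, C2) pairs common to both tables. For example (hypothetical data), if A holds (1, 2) and (3, 4) while B holds only (1, 2), each query finds just (1, 2). The MapReduce job below computes the same two-column intersection over files on HDFS.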
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class plumebobo0002 {
    public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        Text myKey = new Text();
        Text myValue = new Text();

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            if (!value.toString().contains("|"))
                return;
            String[] myStr = value.toString().split("\\|");
            myKey.set(myStr[0] + "|" + myStr[1]); // only the first two columns are needed
            if (myStr.length > 2) {
                myValue.set("W"); // knowledge-base record (more than two columns)
            } else {
                myValue.set("L"); // list record (exactly two columns)
            }
            context.write(myKey, myValue);
        }
    }

    public static class MyCombiner extends Reducer<Text, Text, Text, Text> {
        Text myValue = new Text();

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Don't underestimate this combiner: with large inputs it cuts shuffle
            // volume and improves performance noticeably.
            // Knowledge-base records and list records never appear in the same map
            // task's output (each input split comes from a single file), so taking
            // the first value is enough.
            myValue.set(values.iterator().next().toString());
            context.write(key, myValue);
        }
    }

    public static class MyReducer extends Reducer<Text, Text, Text, NullWritable> {
        NullWritable myValue = NullWritable.get();

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            int wordCount = 0;
            int listCount = 0;
            for (Text val : values) {
                if (val.toString().equals("W") && wordCount == 0) {
                    wordCount++;
                } else if (val.toString().equals("L") && listCount == 0) {
                    listCount++;
                }
            }
            // Emit only the keys that appeared in both inputs.
            if (wordCount == 1 && listCount == 1)
                context.write(key, myValue);
        }
    }

    public static void main(String[] args) throws Exception {
        String[] oArg = new String[3];
        oArg[0] = "/tmp/plumebobo/test0002"; // knowledge-base file
        oArg[1] = "/tmp/plumebobo/out002";
        oArg[2] = "/tmp/plumebobo/keyword0002"; // list file
        Configuration conf = new Configuration();
        conf.set("mapred.job.tracker", "m04.ct1.r01.hdp:9001");
        Job job = new Job(conf, "plumebobo0002");
        job.setJarByClass(plumebobo0002.class);
        job.setMapperClass(MyMapper.class);
        job.setCombinerClass(MyCombiner.class);
        job.setReducerClass(MyReducer.class);
        job.setNumReduceTasks(1);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.addInputPath(job, new Path(oArg[0]));
        FileInputFormat.addInputPath(job, new Path(oArg[2]));
        FileOutputFormat.setOutputPath(job, new Path(oArg[1]));
        job.waitForCompletion(true);
    }
}
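To illustrate with made-up data: if /tmp/plumebobo/test0002 contains the line a|b|x (three columns, so it is tagged "W") and /tmp/plumebobo/keyword0002 contains a|b (two columns, tagged "L"), the reducer receives both tags for the key a|b and writes it out. A key present in only one of the two inputs is dropped.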
import java.util.ArrayList;
import java.util.List;

public class Test {
    public static void main(String[] args) {
        String lineValue = "12345678905|read.qidian.com|http://read.qidian.com/BookReader/2932090,50482961.aspx|0|2932090";
        String[] tempArray = lineValue.split("\\|");
        String secondUrl = tempArray[1]; // second-level domain
        String bookId = tempArray[4];    // book id
        String tempStr = secondUrl + "|" + bookId; // temporary string used for the indexOf lookup
        boolean isFlag = false; // flag: is this record already in the knowledge base?

        List<String> list = new ArrayList<String>();
        list.add("read.qidian.com|2932091|bookname1|author1|xuanhuan");
        list.add("read.qidian.com|2932092|bookname2|author2|xuanhuan");
        list.add("read.qidian.com|2932093|bookname3|author3|yanqing");
        list.add("read.qidian.com|2932094|bookname4|author4|yanqing");
        list.add("read.qidian.com|2932090|bookname5|author5|xuanhuan");

        for (int i = 0; i < list.size(); i++) {
            if (list.get(i).indexOf(tempStr) != -1) {
                isFlag = true; // record found, so the flag flips to true
                System.out.println("----- this site already exists in the knowledge base -----");
            }
        }
    }
}
The second problem is roughly what the code above sketches.
What I actually want to know is how to get that list into the mapper...
I don't really know where to start... I'm reading books looking for examples... going by the API docs alone is rough...
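One common way to hand a small list to every mapper on Hadoop 1.x is the DistributedCache: register the list file in the driver, then load it in the mapper's setup(). Below is a minimal, untested sketch along those lines; the class name ListLookupMapper, the "FOUND" tag, and the field layout (column 1 = domain, column 4 = book id, as in the Test class above) are illustrative assumptions, not code from this thread.

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ListLookupMapper extends Mapper<LongWritable, Text, Text, Text> {
    private final List<String> knowledgeBase = new ArrayList<String>();

    // setup() runs once per map task, before any map() call: read the cached
    // list file from the task node's local disk into memory.
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Path[] cacheFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());
        if (cacheFiles == null)
            return;
        for (Path cacheFile : cacheFiles) {
            BufferedReader reader = new BufferedReader(new FileReader(cacheFile.toString()));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    knowledgeBase.add(line);
                }
            } finally {
                reader.close();
            }
        }
    }

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] fields = value.toString().split("\\|");
        if (fields.length < 5)
            return;
        // Same lookup as the local Test class: secondUrl|bookId.
        String tempStr = fields[1] + "|" + fields[4];
        for (String entry : knowledgeBase) {
            if (entry.indexOf(tempStr) != -1) {
                context.write(new Text(tempStr), new Text("FOUND")); // illustrative tag
                return;
            }
        }
    }
}

In the driver, register the file before submitting the job; Hadoop then ships it to every task node:

Job job = new Job(conf, "listLookup");
DistributedCache.addCacheFile(new java.net.URI("/tmp/plumebobo/keyword0002"), job.getConfiguration());
job.setMapperClass(ListLookupMapper.class);

This only suits lists small enough to hold in memory on each node; for large lists, a reduce-side join like plumebobo0002 above is the usual alternative.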
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class plumebobo0001 {
    public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        Text myKey = new Text();
        Text myValue = new Text();

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            if (!value.toString().contains("|"))
                return;
            String[] myStr = value.toString().split("\\|");
            for (int i = 1; i < myStr.length; i++) {
                myKey.set(myStr[0] + "|" + myStr[i]); // put the data in the key; the value stays empty
                context.write(myKey, myValue);
            }
        }
    }

    public static class MyReducer extends Reducer<Text, Text, Text, NullWritable> {
        Text myKey = new Text();
        NullWritable myValue = NullWritable.get();

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            StringBuilder myStr = new StringBuilder();
            // Iterate and pull the data back out of the key. Grouping is
            // overridden below, so no extra secondary sort is needed here;
            // within one group, `key` is updated to the current full key on
            // each iteration.
            for (Text val : values) {
                if (myStr.length() == 0) {
                    myStr.append(key.toString());
                } else {
                    myStr.append("|");
                    myStr.append(key.toString().split("\\|")[1]);
                }
            }
            myKey.set(myStr.toString());
            context.write(myKey, myValue);
        }
    }

    public static void main(String[] args) throws Exception {
        String[] oArg = new String[2];
        oArg[0] = "/tmp/plumebobo/test0001";
        oArg[1] = "/tmp/plumebobo/out001";
        Configuration conf = new Configuration();
        conf.set("mapred.job.tracker", "m04.ct1.r01.hdp:9001");
        Job job = new Job(conf, "plumebobo0001");
        job.setJarByClass(plumebobo0001.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setNumReduceTasks(1);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.setPartitionerClass(MyPartitioner.class);
        job.setGroupingComparatorClass(MyGroupingComparator.class);
        FileInputFormat.setInputPaths(job, new Path(oArg[0]));
        FileOutputFormat.setOutputPath(job, new Path(oArg[1]));
        job.waitForCompletion(true);
    }
}

// Partition by the first column so all keys sharing it reach the same reducer.
class MyPartitioner extends HashPartitioner<Text, Text> {
    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        Text cols = new Text(key.toString().split("\\|")[0]);
        return super.getPartition(cols, value, numPartitions);
    }
}

// Group reducer input by the value of the first column only.
class MyGroupingComparator implements RawComparator<Text> {
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        Text key1 = new Text();
        Text key2 = new Text();
        DataInputBuffer buffer = new DataInputBuffer();
        try {
            buffer.reset(b1, s1, l1);
            key1.readFields(buffer);
            buffer.reset(b2, s2, l2);
            key2.readFields(buffer);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return compare(key1, key2);
    }

    @Override
    public int compare(Text o1, Text o2) {
        String str1 = o1.toString().split("\\|")[0];
        String str2 = o2.toString().split("\\|")[0];
        return str1.compareTo(str2);
    }
}
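For reference, what this job produces (with hypothetical input): given the lines a|1|2 and a|3 in test0001, the mapper emits the keys a|1, a|2, and a|3; MyPartitioner and MyGroupingComparator send all keys sharing the first column into one reduce group, and the reducer concatenates them back into the single line a|1|2|3. In effect the job collapses all values for one first-column key onto one row.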
For your second problem, what is the output supposed to be?
OP, please spell out the requirements of the second problem a bit more clearly; I'll post code for you to look at later.