Help! A question about Mahout clustering on Hadoop

hyous 2020-04-10 10:50:21
I'm a total beginner, trying to run Mahout's k-means algorithm in a pseudo-distributed environment set up on a virtual machine.
The data file is saved as test.txt; I convert the data into a List<Vector> and then write it out as a SequenceFile via writePointsToFile().
The problem is that there seems to be some limit on the txt size: a 10 MB data file runs fine, but with 100 MB the conversion apparently never gets through...

How can I change this so it can handle larger data files?
Any pointers appreciated, thanks!
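A likely suspect, judging from the code below, is the client JVM heap rather than any limit on txt files: FileToVector materializes every line as a Vector in one big List before anything is written out. One way around that, sketched here as an assumption rather than a verified fix, is to parse and append each line straight to the SequenceFile.Writer so memory use stays flat regardless of input size. streamPointsToFile is a made-up name (not Mahout API), and the sketch assumes the same space-separated numeric format as test.txt and the imports from the class below:

// Hypothetical helper (not from the original post): stream each text line
// straight into the SequenceFile instead of buffering a full List<Vector>.
public static void streamPointsToFile(String txtFile, String seqFile,
                                      FileSystem fs, Configuration conf) throws IOException {
    BufferedReader br = new BufferedReader(
            new InputStreamReader(new FileInputStream(txtFile)));
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf,
            new Path(seqFile), LongWritable.class, VectorWritable.class);
    try {
        String line;
        long recNum = 0;
        VectorWritable vw = new VectorWritable();
        while ((line = br.readLine()) != null) {
            String[] cols = line.split(" ");
            double[] data = new double[cols.length];
            for (int j = 0; j < cols.length; j++) {
                data[j] = Double.parseDouble(cols[j]);
            }
            vw.set(new DenseVector(data));               // one vector at a time
            writer.append(new LongWritable(recNum++), vw);
        }
    } finally {
        writer.close();
        br.close();
    }
}

The only other place the full list is needed is picking the k seed clusters; keeping just the first k vectors aside while streaming would cover that.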


import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
import org.apache.mahout.clustering.kmeans.Kluster;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

public class Kmeans {

    // Read a space-separated text file and turn each line into a Vector.
    // Note: this buffers the whole file in memory, which is what caps the
    // input size at whatever the JVM heap can hold.
    public static List<Vector> FileToVector(FileInputStream fs) throws IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(fs));
        List<Vector> points = new ArrayList<Vector>();
        String readLine;
        while ((readLine = br.readLine()) != null) {
            String[] point = readLine.split(" ");
            // Size the array from the actual line instead of a hardcoded 20,
            // so lines with other widths don't break.
            double[] data = new double[point.length];
            for (int j = 0; j < point.length; j++) {
                data[j] = Double.parseDouble(point[j]);
            }
            // The input is fully dense, so DenseVector is a better fit than
            // RandomAccessSparseVector.
            points.add(new DenseVector(data));
        }
        br.close();
        return points;
    }

    // Write the vectors out as a SequenceFile<LongWritable, VectorWritable>,
    // the input format Mahout's k-means expects.
    public static void writePointsToFile(List<Vector> points,
                                         String fileName,
                                         FileSystem fs,
                                         Configuration conf) throws IOException {
        Path path = new Path(fileName);
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
                LongWritable.class, VectorWritable.class);
        long recNum = 0;
        VectorWritable vec = new VectorWritable();
        for (Vector point : points) {
            vec.set(point);
            writer.append(new LongWritable(recNum++), vec);
        }
        writer.close();
    }
    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "root");

        FileInputStream fis = new FileInputStream("E:\\Users\\ZZS\\Desktop\\结果\\new-data\\test.txt");
        int k = 4;
        List<Vector> vectors = FileToVector(fis);

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Create the input directory on the FileSystem the writers use.
        // (java.io.File.mkdir() only creates local directories, which is the
        // wrong place when conf points at HDFS.)
        Path pointsDir = new Path("clustering/testdata/points");
        if (!fs.exists(pointsDir)) {
            fs.mkdirs(pointsDir);
        }

        // Put the vectors into file1 in a format Mahout can read.
        writePointsToFile(vectors, "clustering/testdata/points/file1", fs, conf);

        // Seed the initial clusters with the first k points.
        Path path = new Path("clustering/testdata/clusters/part-00000");
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
                Text.class, Kluster.class);
        for (int i = 0; i < k; i++) {
            Vector vec = vectors.get(i);
            // Distance between points is measured with Euclidean distance.
            Kluster cluster = new Kluster(vec, i, new EuclideanDistanceMeasure());
            writer.append(new Text(cluster.getIdentifier()), cluster);
        }
        writer.close();

        KMeansDriver.run(conf,
                new Path("clustering/testdata/points"),   // input path
                new Path("clustering/testdata/clusters"), // initial clusters path
                new Path("clustering/output"),            // output path
                0.001,                                    // convergenceDelta
                10,                                       // maxIterations
                true,                                     // runClustering
                0,                                        // clusterClassificationThreshold
                true);                                    // runSequential

        // With runSequential=true the clustered points come back as the single
        // part-m-0 file this path assumes.
        SequenceFile.Reader reader = new SequenceFile.Reader(fs,
                new Path("clustering/output/clusteredPoints/part-m-0"), conf);

        // Redirect stdout into a local result file.
        File f2 = new File("E:\\结果\\hadoop\\kmeans");
        PrintStream ps = new PrintStream(f2.getPath());
        System.setOut(ps);

        IntWritable key = new IntWritable();
        WeightedPropertyVectorWritable value = new WeightedPropertyVectorWritable();
        while (reader.next(key, value)) {
            System.out.println(value.toString() + " belongs to cluster " + key.toString());
        }
        ps.close();
        reader.close();
    }

}
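Two more things that may matter for scale, offered as assumptions rather than a verified fix: runSequential is passed as true above, which (in the Mahout versions I've seen) makes KMeansDriver iterate inside the client JVM, so the clustering itself is bounded by the same local heap as the conversion step; raising that heap (e.g. launching with -Xmx2g) is the blunt-instrument workaround if the list-based code stays. With runSequential = false the iterations run as MapReduce jobs instead, and the clustered points come back split across part files, so it is safer to enumerate them than to hardcode part-m-0. A sketch of both changes, replacing the tail of main (needs org.apache.hadoop.fs.FileStatus imported):

// Sketch, under the assumptions above: hand the iterations to MapReduce
// and read back whatever part files the job actually produced.
KMeansDriver.run(conf,
        new Path("clustering/testdata/points"),
        new Path("clustering/testdata/clusters"),
        new Path("clustering/output"),
        0.001, 10, true, 0,
        false);                                     // runSequential = false

Path clusteredPoints = new Path("clustering/output/clusteredPoints");
for (FileStatus status : fs.listStatus(clusteredPoints)) {
    if (!status.getPath().getName().startsWith("part-")) {
        continue;                                   // skip _SUCCESS and other non-data files
    }
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, status.getPath(), conf);
    IntWritable key = new IntWritable();
    WeightedPropertyVectorWritable value = new WeightedPropertyVectorWritable();
    while (reader.next(key, value)) {
        System.out.println(value + " belongs to cluster " + key);
    }
    reader.close();
}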

1 reply
hyous 2020-04-23
Anyone willing to take a look at this?
