62,614
社区成员
发帖
与我相关
我的任务
分享
/**
 * Provides the shared stack of files awaiting extraction.
 * NOTE(review): no assignment to this field is visible in this excerpt — confirm it is
 * initialised before {@link #run()} executes.
 */
private traverFolder traverfolder;

/**
 * Performs one extraction pass: takes the monitor of the shared file stack, delegates to
 * {@code doWork()}, then sleeps for one millisecond.
 *
 * <p>Failures are logged rather than silently discarded, and an interruption restores the
 * thread's interrupt flag so that the code driving this thread can observe it.
 */
public void run() {
    try {
        // Hold the stack's monitor for the whole pass so only one thread works on it at a time.
        synchronized (traverfolder.getfileStack()) {
            doWork();
        }
        LOG.info("sleep " + " seconds");
        Thread.sleep(1);
    } catch (InterruptedException e) {
        // Restore the flag: catching InterruptedException clears it.
        Thread.currentThread().interrupt();
        LOG.error("extraction thread interrupted", e);
    } catch (Exception e) {
        // Previously swallowed; logging keeps the best-effort behaviour but leaves a trace.
        LOG.error("extraction pass failed", e);
    }
}
/**
 * Processes the shared file stack obtained from {@code traverFolder.getInstance()}.
 *
 * <p>While holding the stack's monitor, every entry is popped and handed to
 * {@code extractXML} until the stack is empty. If the stack reference is {@code null},
 * the method instead logs a message and calls {@code wait()} on the stack.
 *
 * <p>NOTE(review): the closing brace of this method does not appear before the next class
 * declaration in this excerpt — confirm against the complete file.
 */
private void doWork() {
// Stack of XML files waiting to be extracted.
Stack<String> fileStack = null;
// Extraction threads must not take files from the pending stack at the same time.
synchronized (traverFolder.getInstance().getfileStack()) {
// Fetch the pending stack and proceed only when a stack object exists.
if ((fileStack = traverFolder.getInstance()
.getfileStack()) != null) {
// Pop and extract entries one by one until the stack is empty.
while(!fileStack.empty())
{
String dir = fileStack.pop();
extractXML(dir);
}
} else {
// NOTE(review): this branch runs only when getfileStack() returned null, yet the
// enclosing synchronized statement and the wait() call below both dereference that
// value. An "is empty" check may have been intended — confirm.
// LOG.info("executableScriptQueue is empty");
try {
// Block this thread on the stack's monitor until another thread notifies it.
// The original comment says the scheduling thread does the wake-up; no notify
// call is visible in this excerpt.
LOG.info(threadID + " wait for next fileStack file");
traverFolder.getInstance().getfileStack()
.wait();
} catch (Exception e) {
LOG.error("wait for notify by fileStack error",
e);
}
}
}
/**
 * Singleton that walks a fixed set of source directories and collects the absolute path of
 * every regular file found into one shared stack, which extraction threads then consume.
 *
 * <p>The stack is allocated when the class is loaded and instance creation is synchronized,
 * so concurrent callers of {@link #getInstance()} always see the same instance and stack.
 */
class traverFolder {
    private static final Log LOG = LogFactory.getLog("traverFolder");

    /**
     * Shared stack of file paths awaiting extraction. Allocated eagerly so that the public
     * static {@link #refreshFileList(String)} is usable even before the singleton exists.
     */
    private static final Stack<String> fileStack = new Stack<String>();

    /** The lazily created singleton instance. */
    private static traverFolder traverfolder;

    /** Creates the singleton and fills the file stack by scanning the source directories. */
    private traverFolder() {
        traverDirs();
    }

    /**
     * Returns the singleton, creating it on first use.
     *
     * <p>Synchronized so that two threads racing through the first call cannot both run the
     * constructor, which would scan the directories twice.
     *
     * @return the single {@code traverFolder} instance
     */
    public static synchronized traverFolder getInstance() {
        if (traverfolder == null) {
            traverfolder = new traverFolder();
        }
        return traverfolder;
    }

    /**
     * Returns the shared stack of pending file paths.
     *
     * @return the live stack itself, not a copy; callers synchronize on it
     */
    public synchronized Stack<String> getfileStack() {
        return fileStack;
    }

    /**
     * Recursively walks a directory and pushes the absolute path of every non-directory
     * entry onto the shared stack, printing each path in lower case to standard output.
     *
     * <p>Does nothing when {@code strPath} cannot be listed, i.e. when
     * {@link File#listFiles()} returns {@code null}.
     * NOTE(review): no extension filter is applied, so every file is queued, not only
     * XML files — confirm that this is intended.
     *
     * @param strPath path of the directory to scan
     */
    public static void refreshFileList(String strPath) {
        File[] files = new File(strPath).listFiles();
        if (files == null) {
            return;
        }
        for (File file : files) {
            String absolutePath = file.getAbsolutePath();
            if (file.isDirectory()) {
                refreshFileList(absolutePath);
            } else {
                System.out.println("---" + absolutePath.toLowerCase());
                fileStack.push(absolutePath);
            }
        }
    }

    /**
     * Calls {@code HbaseInterface.createTable} for the table named by
     * {@code fieldDicts.appItemTableNameInHbase}, then scans each configured sub-directory
     * of {@code /data/search_datasource_backup/} and queues the files found there.
     *
     * <p>A failure while scanning one sub-directory is logged and does not stop the scan of
     * the remaining ones.
     */
    public void traverDirs() {
        System.out.println("Start!");
        String tableName = fieldDicts.appItemTableNameInHbase;
        HbaseInterface.createTable(tableName, "INFO", -1);

        // Root directory holding the original files.
        String dir_source = "/data/search_datasource_backup/";
        String[] typePaths = {"1/2014", "2/2014", "5/2014", "10/2014", "11/2014"};
        for (String typePath : typePaths) {
            String dirpath = dir_source + typePath;
            try {
                // Walk this sub-directory and everything beneath it.
                refreshFileList(dirpath);
            } catch (Exception e) {
                LOG.error("Fail to deal with folder: " + dirpath);
                LOG.error("[OVERALL ERROR] ", e);
            }
        }
    }
}
import java.io.File;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
/**
* 多线程抓取数据的简单程序
*/
public class MultithreadFetcher {
/** 阻塞队列的最大长度,防止内存溢出。 */
public static final int MAX_QUEUE_SIZE = 100;
/** 最大递归深度,防止递归死循环 */
public static final int RECURSION_LEVEL = Integer.MAX_VALUE;
/** 致命毒药,优雅关闭后续的工作线程 */
private static final File DEADLY_POISON = new File("./deadly.tmp");
/**
* 递归遍历文件夹,将遍历的文件放入队列中。
* @param folder 目标文件夹
* @param fileQueue 文件队列
* @param level 递归深度
*/
private static void visitFolder(File folder, BlockingQueue<File> fileQueue, int level) throws InterruptedException{
if(level<=0){//控制递归深度,防止递归死循环。
return;
}
File[] files = folder.listFiles();
for(File file : files){
if(file.isDirectory()){
visitFolder(file,fileQueue,level-1);
}else if(file.getName().toLowerCase().endsWith(".xml")){
fileQueue.put(file);
}else{
//do nothing ...
}
}
}
/**
* 创建目标文件。通过原文件的名称创建一个新的文件。
* @param file 原始文件
* @param targetFolder 目标文件夹
* @return 新的文件,目标文件
*/
private static File createTargetFile(File file, File targetFolder){
String targetFileName = file.getName();
return new File(targetFolder,targetFileName);
}
/**
* 处理文件的操作,可以在这个里面读取文件数据,解析文件,抓取网页,写入备份。
* @param file 原始文件,待解析的文件
* @param target 目标文件,备份文件
*/
private static void travelFile(File file, File target) throws Throwable{
//详细操作从略
}
/** 递归文件夹的线程。不支持多线程并发递归。 */
static class VisitFolderThread extends Thread{
private File folder;
private BlockingQueue<File> fileQueue;
public VisitFolderThread(File folder, BlockingQueue<File> fileQueue) {
super("visit-folder-thread");
this.folder = folder;
this.fileQueue = fileQueue;
}
@Override
public void run() {
try {
visitFolder(folder, fileQueue, RECURSION_LEVEL);
fileQueue.put(DEADLY_POISON);//放置毒药,优雅关闭
} catch (InterruptedException e) {
// 在这里可以做一些异常处理
e.printStackTrace();
}
}
}
/** 处理文件的线程,可以多线程并发处理,每个线程处理一个文件 */
static class TravelFileThread extends Thread{
private static final AtomicInteger ThreadCount = new AtomicInteger();
private File targetFolder;
private BlockingQueue<File> fileQueue;
public TravelFileThread(File targetFolder, BlockingQueue<File> fileQueue) {
super("travel-file-thread-"+ThreadCount.incrementAndGet());
this.targetFolder = targetFolder;
this.fileQueue = fileQueue;
}
@Override
public void run() {
File file = null;
try {
while((file=fileQueue.take())!=DEADLY_POISON){
File target = createTargetFile(file, targetFolder);
try {
travelFile(file, target);
} catch (Throwable e) {
onException(e,file,target);
}
}
fileQueue.put(DEADLY_POISON);//放置毒药,优雅关闭
} catch (InterruptedException e) {
// 在这里可以做一些异常处理
e.printStackTrace();
}
}
/** 在处理文件的过程中,如果抛出异常,则进入下面的处理程序,从略。 */
private void onException(Throwable e, File file, File target) {
// 如果travelFile抛出异常,则在此处进行处理。
e.printStackTrace();
}
}
private BlockingQueue<File> fileQueue = new LinkedBlockingQueue<File>(MAX_QUEUE_SIZE);
private Thread visitFolderThread;
private Thread[] travelFileThreads;
public MultithreadFetcher(File sourceFolder, File targetFolder, int travelThreads) {
super();
visitFolderThread = new VisitFolderThread(sourceFolder, fileQueue);
travelFileThreads = new TravelFileThread[travelThreads];
for(int i=0;i<travelFileThreads.length;i++){
travelFileThreads[i] = new TravelFileThread(targetFolder, fileQueue);
}
}
/**
* 开始执行
*/
public void start(){
visitFolderThread.start();
for(int i=0;i<travelFileThreads.length;i++){
travelFileThreads[i].start();
}
}
/**
* 强行终止。请慎用。程序会自动关闭
*/
public void terminate(){
visitFolderThread.interrupt();
for(int i=0;i<travelFileThreads.length;i++){
travelFileThreads[i].interrupt();
}
}
/**
* 测试用例
*/
public static void main(String[] args) {
final File sourceFolder = new File("");
final File targetFolder = new File("");
final int travelThreads = 20;
MultithreadFetcher fetcher = new MultithreadFetcher(sourceFolder,targetFolder,travelThreads);
fetcher.start();
}
}
// Fill the per-thread stacks before any worker starts.
traverDirs();
LOG.info("There are "+count+"xml files!");
stackList.add(stack0);
stackList.add(stack1);
stackList.add(stack2);
stackList.add(stack3);
stackList.add(stack4);
stackList.add(stack5);
stackList.add(stack6);
stackList.add(stack7);
stackList.add(stack8);
stackList.add(stack9);
// Keep a handle on every worker so that completion can be awaited below.
final Thread[] extractorThreads = new Thread[10];
try {
    for (int i = 0; i < extractorThreads.length; i++) {
        // Each worker drains only the stack at its own index.
        final int p = i;
        extractorThreads[i] = new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    LOG.info("Thread "+p+" Started!");
                    while (!stackList.get(p).isEmpty()) {
                        extractXML(stackList.get(p).pop());
                    }
                    if (stackList.get(p).isEmpty()) {
                        LOG.info("Stack "+p+" is Empty!");
                    }
                } catch (Exception e) {
                    LOG.error("",e);
                }
            }
        });
        extractorThreads[i].start();
    }
    // Wait for every worker: the completion message used to be logged immediately after
    // start(), before any worker had actually finished.
    for (Thread extractor : extractorThreads) {
        extractor.join();
    }
    LOG.info("Threads have fineshed!");
} catch (InterruptedException e) {
    // Restore the flag cleared by the throw so that callers can observe the interruption.
    Thread.currentThread().interrupt();
    e.printStackTrace();
} catch (Exception e) {
    e.printStackTrace();
}