java io 读入文件怎么过滤特殊字符

chenliangtadu 2013-03-05 10:14:06

1€356180814€用物质利益换取人情利益€1000224543€苏豫€用物质利益换取人情利益

以上是文件内容，编码格式为 GBK。我想要读取文件里的内容，但是用以下代码读取出来的是乱码：
// Reads the first line of the data file, decoding it as GBK, and prints it.
BufferedReader reader = null;
try {
	reader = new BufferedReader(new InputStreamReader(new FileInputStream("C:/a_book_meta_20130131_1.dat"), "GBK"));
	System.out.println(reader.readLine());
} catch (Exception e) {
	e.printStackTrace();
} finally {
	// Closing the reader also closes the wrapped file stream; do it even when reading fails.
	if (reader != null) {
		try {
			reader.close();
		} catch (Exception closeError) {
			closeError.printStackTrace();
		}
	}
}
试了一下，是特殊字符 € 的原因。大家有什么解决方案？谢谢！搞了一天也没发现什么好办法！
求哪位大神赐教！

...全文

669 13 打赏收藏转发到动态举报

写回复

用AI写文章

13 条回复

切换为时间正序

请发表友善的回复…

发表回复

chenliangtadu 2013-03-22

打赏
举报


package com.tadu.utils.ftp;

import java.io.File;
import java.io.FileInputStream;
import java.sql.Connection;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;

/**
 * Imports a book data file into the database.
 *
 * <p>The file is handled as raw bytes: fields are separated by the single byte 0x80, records end
 * with CR LF, and the bytes of each field are decoded as GBK.
 *
 * <p>NOTE(review): {@code f}, {@code b}, {@code m} and {@code bf} are not declared in this class;
 * presumably they are inherited from {@code FtpCmsReader} - confirm. {@code Math.isChinese} is a
 * project helper, not {@code java.lang.Math}.
 */
public class FtpBookReader extends FtpCmsReader {

	/** Byte that separates two fields of a record. */
	private static final byte FIELD_SEPARATOR = (byte) 0x80;
	/** Carriage return, first byte of the record terminator. */
	private static final byte CR = 0x0d;
	/** Line feed, second byte of the record terminator. */
	private static final byte LF = 0x0a;
	/** Number of records queued before the batch is executed and committed. */
	private static final int BATCH_SIZE = 1000;

	public FtpBookReader(File f) {
		super(f);
	}

	/**
	 * Loads the whole file, splits it into records and inserts them in batches.
	 *
	 * <p>A byte pair that {@code Math.isChinese} reports as a double-byte character is copied
	 * verbatim, so a 0x80 trail byte inside a character is not mistaken for a field separator.
	 * Queued statements are executed and committed every {@link #BATCH_SIZE} records and once more
	 * at the end of the file; a final record that is not terminated by CR LF is inserted as well.
	 *
	 * <p>Failures are reported on standard output, the uncommitted work is rolled back, and the
	 * stream, statement and connection are released in every case.
	 */
	@Override
	public void readLine() {
		// Field values of the record currently being read.
		List<String> attrList = new ArrayList<String>();
		FileInputStream reader = null;
		Connection con = null;
		Statement statement = null;
		try {
			reader = new FileInputStream(f);
			byte[] bs = readFully(reader, (int) f.length());
			int pointer = 0; // index of the next unread byte
			int len = bs.length;
			int line = 1; // 1-based number of the record being read
			con = JdbcUtil.getConnection();
			con.setAutoCommit(false);
			statement = con.createStatement();
			while (pointer < len) {
				b = bs[pointer++];
				// The last byte of the file has no successor, so it cannot start a double-byte character.
				boolean hasNext = pointer < len;
				m = hasNext ? bs[pointer] : 0;
				boolean flag = hasNext && Math.isChinese(b, m);
				if (b == FIELD_SEPARATOR) {
					attrList.add(new String(bf.array(), 0, bf.position(), "GBK"));
					bf.clear();
				} else if (b == CR) {
					if (hasNext && m == LF) {
						attrList.add(new String(bf.array(), 0, bf.position(), "GBK"));
						String sqlTemplate = queueInsert(statement, attrList);
						System.out.println(len + "----------" + pointer);
						if (line % BATCH_SIZE == 0) {
							flush(statement, con);
						}
						logRecord(line, attrList, sqlTemplate);
						bf.clear();
						attrList.clear();
						pointer++; // skip the LF
						line++;
					} else {
						// A lone CR is ordinary field content.
						bf.put(b);
					}
				} else if (flag) {
					// Double-byte character: keep both bytes together and skip the trail byte.
					bf.put(b);
					bf.put(m);
					pointer++;
				} else {
					bf.put(b);
				}
			}
			// Commit whatever is still queued; the file length is rarely a multiple of BATCH_SIZE.
			flush(statement, con);
			if (bf.position() > 0 || !attrList.isEmpty()) {
				// The file ended without CR LF: the bytes read so far form the last record.
				// It is committed separately so that a malformed tail cannot undo the rows above.
				attrList.add(new String(bf.array(), 0, bf.position(), "GBK"));
				String sqlTemplate = queueInsert(statement, attrList);
				flush(statement, con);
				logRecord(line, attrList, sqlTemplate);
				bf.clear();
				attrList.clear();
			}
		} catch (Exception e) {
			// Fewer than three fields may have been read when the failure happens, so guard the lookup.
			String context = attrList.size() > 2 ? attrList.get(2) : attrList.toString();
			System.out.println("同步全量文件出错 出错在" + context);
			e.printStackTrace();
			if (con != null) {
				try {
					con.rollback();
				} catch (Exception rollbackError) {
					rollbackError.printStackTrace();
				}
			}
		} finally {
			if (con != null) {
				try {
					JdbcUtil.close(statement, con);
				} catch (Exception closeError) {
					closeError.printStackTrace();
				}
			}
			if (reader != null) {
				try {
					reader.close();
				} catch (Exception closeError) {
					closeError.printStackTrace();
				}
			}
		}
	}

	/** Reads up to {@code expected} bytes, looping because a single read may return fewer bytes. */
	private static byte[] readFully(FileInputStream in, int expected) throws java.io.IOException {
		byte[] data = new byte[expected];
		int filled = 0;
		while (filled < data.length) {
			int n = in.read(data, filled, data.length - filled);
			if (n < 0) {
				break;
			}
			filled += n;
		}
		if (filled == data.length) {
			return data;
		}
		byte[] exact = new byte[filled];
		System.arraycopy(data, 0, exact, 0, filled);
		return exact;
	}

	/**
	 * Builds the insert statement for one record and adds it to the batch.
	 *
	 * <p>NOTE(review): the SQL text is produced by {@code SqlUtil.sqlExtator} from values read from
	 * the file; verify that it escapes them, or switch to a {@code PreparedStatement}.
	 */
	private static String queueInsert(Statement statement, List<String> attrList) throws Exception {
		String sql = SqlUtil.sqlExtator("books.xml", SqlUtil.SQL_INSERT, attrList);
		statement.addBatch(sql);
		return sql;
	}

	/** Executes the queued statements and commits them. */
	private static void flush(Statement statement, Connection con) throws java.sql.SQLException {
		statement.executeBatch();
		con.commit();
	}

	/** Prints the progress lines for one record. */
	private void logRecord(int line, List<String> attrList, String sqlTemplate) {
		System.out.println("同步 全量文件 ------正在读取文件 " + f.getName() + "读取到行" + line +"------------" + "属性长度为" + attrList.size());
		System.out.println("插入一条数据成功--------------------------------------------------------------- sql" + sqlTemplate);
	}
}

经过半个月时间，终于自己搞定了，好累。

chenliangtadu 2013-03-08

打赏
举报

package com.tadu.utils.ftp;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.dom4j.DocumentException;

import com.tadu.vo.mobile.Books;

/**
 * Reads a GBK-encoded data file in which every line is one record and the fields of a record are
 * separated by the single byte 0x80 (the "€" character of the file). Empty fields are still
 * delimited by the separator.
 *
 * @author Chen Liang
 */
public class FtpCmsReader {
	/** Byte that separates two fields of a record. */
	private static final byte FIELD_SEPARATOR = (byte) 0x80;
	/** Line feed, which ends a record. */
	private static final byte LF = 10;

	// Byte stream of the file being read.
	private FileInputStream reader = null;
	// Collects the bytes of the current field until they are decoded; grows on demand.
	private ByteBuffer bf = ByteBuffer.allocate(1024);

	/**
	 * Creates a reader on an already opened stream.
	 *
	 * @param reader stream of the file to parse; it is closed by {@link #readLine(Class)}
	 */
	public FtpCmsReader(FileInputStream reader) {
		  this.reader = reader;
	}

	/**
	 * Reads the whole stream, splits it into records and fields, prints them, and closes the stream.
	 *
	 * <p>Bytes that form a GBK double-byte character are kept together, so a character whose second
	 * byte happens to be 0x80 is not cut in half at a false separator. A final record without a
	 * trailing line feed is processed like any other record.
	 *
	 * <p>The conversion of the fields into beans is currently commented out, so the returned list
	 * is always empty.
	 *
	 * @param c entity class; its simple name plus ".xml" names the mapping file that is parsed
	 * @return the list of converted beans (currently always empty, see above)
	 * @throws IOException if the stream cannot be read or GBK is not supported
	 * @throws DocumentException declared for {@code FileFtpUtil.parseXml}
	 */
	public <T> List<T> readLine(Class c) throws IOException, DocumentException {
		List<T> resultList  = new ArrayList<T>();
		// Field values of the record currently being read.
		List<String> attrList = new ArrayList<String>();
		// Naming rule: the mapping file is called after the entity class, e.g. Book.java -> Book.xml.
		String fileName = c.getSimpleName();
		List<Map<String, String>> beanAttrList = FileFtpUtil.parseXml(fileName + ".xml");
		try {
			byte[] bs = readAll();
			int len = bs.length;
			int pointer = 0;
			while (pointer < len) {
				byte current = bs[pointer++];
				System.out.println(current);
				if (pointer < len && isGbkPair(current, bs[pointer])) {
					// Double-byte character: copy lead and trail byte without inspecting the trail byte.
					byte trail = bs[pointer++];
					System.out.println(trail);
					append(current);
					append(trail);
				} else if (current == FIELD_SEPARATOR) {
					endField(attrList);
				} else if (current == LF) {
					endField(attrList);
					endRecord(attrList);
				} else {
					append(current);
				}
			}
			if (bf.position() > 0 || !attrList.isEmpty()) {
				// No line feed after the last record: finish it with what has been read.
				endField(attrList);
				endRecord(attrList);
			}
		} finally {
			reader.close();
		}
		return resultList;
	}

	/** Reads the stream to its end, looping because a single read may return fewer bytes. */
	private byte[] readAll() throws IOException {
		int hint = reader.available();
		byte[] data = new byte[hint > 1024 ? hint : 1024];
		int filled = 0;
		int n;
		while ((n = reader.read(data, filled, data.length - filled)) != -1) {
			filled += n;
			if (filled == data.length) {
				byte[] grown = new byte[data.length * 2];
				System.arraycopy(data, 0, grown, 0, filled);
				data = grown;
			}
		}
		byte[] exact = new byte[filled];
		System.arraycopy(data, 0, exact, 0, filled);
		return exact;
	}

	/** Tells whether the two bytes lie in the GBK lead (0x81-0xFE) and trail (0x40-0xFE, not 0x7F) ranges. */
	private static boolean isGbkPair(byte lead, byte trail) {
		int hi = lead & 0xFF;
		int lo = trail & 0xFF;
		return hi >= 0x81 && hi <= 0xFE && lo >= 0x40 && lo <= 0xFE && lo != 0x7F;
	}

	/** Adds one byte to the field buffer, doubling the buffer when it is full. */
	private void append(byte value) {
		if (!bf.hasRemaining()) {
			ByteBuffer grown = ByteBuffer.allocate(bf.capacity() * 2);
			bf.flip();
			grown.put(bf);
			bf = grown;
		}
		bf.put(value);
	}

	/** Decodes the buffered bytes as GBK, stores the trimmed value as the next field and prints it. */
	private void endField(List<String> attrList) throws IOException {
		String field = new String(bf.array(), 0, bf.position(), "GBK");
		attrList.add(StringUtils.trim(field));
		System.out.println(field);
		bf.clear();
	}

	/** Closes the current record: prints a divider and resets the field list. */
	private void endRecord(List<String> attrList) {
		System.out.println("______________________________________________________________");
		// TODO: bean conversion is disabled; re-enable it in readLine once FileFtpUtil.transAttrToBean is ready:
		//T  t  = FileFtpUtil.transAttrToBean(attrList, c,beanAttrList);
		//resultList.add(t);
		attrList.clear();
	}

	public static void main(String[] args){

		try {
			FtpCmsReader reader = new FtpCmsReader(new FileInputStream("c:\\11.txt"));
			List<Books> list = reader.readLine(Books.class);

		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} catch (DocumentException e) {
			e.printStackTrace();
		}

	}
}

之后发现文件中有繁体字，又解析不了了，直接显示成“？”。

chenliangtadu 2013-03-07

打赏
举报

想法是对的，我现在也是按照你这么干的，不过我还没调好。

miaowhehe 2013-03-07

打赏
举报

引用 9 楼 wapigzhu 的回复:

试试加入“崁，寑”随便gbk里面找几个出来含x80的都会出错

额，那就再改进一下。。。


// Replace every stand-alone 0x80 byte with a space while leaving double-byte characters intact.
for (int i = 0; i < arr.length; i++) {
	byte current = arr[i];
	if (current >= 0) {
		// Single-byte (ASCII) value: nothing to do.
		continue;
	}
	if (current == -128) {
		arr[i] = ' ';
	} else {
		// Any other negative byte is taken as the first byte of a double-byte character,
		// so the byte after it is skipped without being examined.
		i++;
	}
}

这样就不会影响正常汉字了。可以在文件里保存“€123€€人情崁€，€€€寑丂”试试。单独的“€”会被替换成空格，但包含80H的正常汉字“崁寑丂”等都可以正常显示。

wapigzhu 2013-03-07

打赏
举报

引用 8 楼 miaowhehe 的回复:

Java code?1234567891011121314try { File file = new File("C:/a_book_meta_20130131_1.dat"); FileInputStream fis = new FileInputStream(file); byte[] arr = new byte[(int) file.length……

他这个问题应该是没办法判断的, 你这样解的话试试加入“崁，寑”随便gbk里面找几个出来含x80的都会出错

miaowhehe 2013-03-06

打赏
举报


// Loads the whole file, replaces every 0x80 byte with a space, then decodes the bytes as GBK.
// NOTE(review): the replacement does not look at character boundaries, so a double-byte
// character whose second byte is 0x80 is damaged as well.
FileInputStream fis = null;
try {
	File file = new File("C:/a_book_meta_20130131_1.dat");
	fis = new FileInputStream(file);
	byte[] arr = new byte[(int) file.length()];
	// A single read() may return fewer bytes than requested, so keep reading until the array is full.
	int filled = 0;
	while (filled < arr.length) {
		int n = fis.read(arr, filled, arr.length - filled);
		if (n < 0) {
			break;
		}
		filled += n;
	}
	for (int i = 0; i < filled; i++) {
		if (arr[i] == -128)
			arr[i] = ' ';
	}
	String str = new String(arr, 0, filled, "GBK");
	System.out.println(str);
} catch (Exception e) {
	e.printStackTrace();
} finally {
	// Release the file handle even when reading or decoding fails.
	if (fis != null) {
		try {
			fis.close();
		} catch (Exception closeError) {
			closeError.printStackTrace();
		}
	}
}

这样可以把那个见鬼的字符换成空格，亲测无乱码。

chenliangtadu 2013-03-06

打赏
举报

这个文件是运营传到 FTP 上的，合作关系文档里已经写好是 GBK，这个没办法改！我想只能用字节流了，谁有相关的办法说一下？

qq1212 2013-03-06

打赏
举报

用utf-8即可，€不被gbk编码所支持

失落夏天 2013-03-05

打赏
举报

引用楼主 chenliangtadu 的回复:

1€356180814€用物质利益换取人情利益€1000224543€苏豫€用物质利益换取人情利益以上是文件内容编码格式为GBK 我想要读取文件里的内容但是用以下代码读取出来的为乱码 BufferedReader reader = null; try{ reader = new BufferedReader(new InputStreamReader(new……

首先，那不是应该被过滤的字符，只是编码格式的问题。你可以使用 String s; String str = new String(s.getBytes("原有格式编码"), "要转换的格式编码"); 来转换。当然，你非得想过滤掉那个字符的话也容易：String s = "f€ewfef"; s = s.replaceAll("€", ""); 这样就行了。

chenliangtadu 2013-03-05