如何获得文件的编码格式

饽饽 2009-09-25 07:49:08

我想做一个小工具，能够将文件夹下所有文件中的指定字符进行替换。可是替换后的文件经常有乱码。
别人告诉我应该先获得文件的编码格式，然后再读取文件，
请问如何获得文件的编码格式？文件在重写之前系统内都是能够正常显示的，没有乱码。
但是我用StreamReader objReader读进来是带有乱码的，因为有的文件是UTF8,有的不是

我当初是这样设计的，

使用StreamReader objReader逐行读取文件，将所有的字符放到一个ArrayList arrsource中，然后删除原文件，将ArrayList写入文件

...全文

1283 8 打赏收藏转发到动态举报

写回复

用AI写文章

8 条回复

切换为时间正序

请发表友善的回复…

发表回复

饽饽 2009-09-27

打赏
举报

3楼Dobzhansky介绍的代码很好用，非常感谢

limii 2009-09-26

打赏
举报

JGood 2009-09-26

打赏
举报

如果文件内容是由 Windows 自带的 Notepad 保存的，那么它会在文件的前几个字节加上 BOM 头之类的字节。用 .NET 程序读取的时候，不管你设置的编码是什么，它都会根据文件头几个字节正确地读出文本的内容。但是，并不是所有的文本编辑器都会像 Notepad 一样写入 BOM，有些会直接将文本保存到文件里。这时候用程序读取，就要明确地指定文本的编码类型。编码其实是一个很烦人的问题，网上可能有一些根据文本内容来判断其编码的程序，但并不能完全正确地判断出所有的编码类型。

http://blog.csdn.net/JGood/archive/2009/09/10/4540466.aspx

qin_wei 2009-09-26

打赏
举报

我有个类直接用就行
直接使用类的静态函数TxtFileEncoding.GetEncoding(string fileName)

using System;
using System.Text;
using System.IO;
namespace BEST.Public
{
/// <summary>
/// Detects the encoding of a text file by inspecting its leading bytes
/// (the byte order mark, plus a simple heuristic for BOM-less UTF-16 LE).
/// </summary>
public class TxtFileEncoding
{
    /// <summary>
    /// Creates an instance. All functionality is exposed through static methods;
    /// the constructor is kept only so existing callers continue to compile.
    /// </summary>
    public TxtFileEncoding()
    {
    }

    /// <summary>
    /// Detects the encoding of a text file, falling back to
    /// <see cref="Encoding.Default"/> when no known preamble is found.
    /// </summary>
    /// <param name="fileName">Path of the file to inspect.</param>
    /// <returns>The detected encoding, or <see cref="Encoding.Default"/>.</returns>
    public static Encoding GetEncoding(string fileName)
    {
        return GetEncoding(fileName, Encoding.Default);
    }

    /// <summary>
    /// Detects the encoding of an open file stream, falling back to
    /// <see cref="Encoding.Default"/> when no known preamble is found.
    /// </summary>
    /// <param name="stream">The file stream to inspect.</param>
    /// <returns>The detected encoding, or <see cref="Encoding.Default"/>.</returns>
    public static Encoding GetEncoding(FileStream stream)
    {
        return GetEncoding(stream, Encoding.Default);
    }

    /// <summary>
    /// Detects the encoding of a text file.
    /// </summary>
    /// <param name="fileName">Path of the file to inspect.</param>
    /// <param name="defaultEncoding">
    /// Encoding returned when no known preamble is found or the file cannot be opened or read.
    /// </param>
    /// <returns>The detected encoding, or <paramref name="defaultEncoding"/>.</returns>
    public static Encoding GetEncoding(string fileName, Encoding defaultEncoding)
    {
        // Detection is best-effort: failures to open or read the file yield the
        // default encoding instead of an exception. Only the failures a file open/read
        // can legitimately produce are caught, so programming errors are not hidden.
        try
        {
            // Read-only access is sufficient and lets read-only files be inspected.
            using (FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read))
            {
                return GetEncoding(fs, defaultEncoding);
            }
        }
        catch (IOException)
        {
            return defaultEncoding;
        }
        catch (UnauthorizedAccessException)
        {
            return defaultEncoding;
        }
        catch (ArgumentException)
        {
            // Covers null, empty and otherwise invalid paths.
            return defaultEncoding;
        }
        catch (NotSupportedException)
        {
            return defaultEncoding;
        }
        catch (System.Security.SecurityException)
        {
            return defaultEncoding;
        }
    }

    /// <summary>
    /// Detects the encoding of an open file stream from its first bytes.
    /// The stream position is restored to its value on entry before returning.
    /// </summary>
    /// <param name="stream">The file stream to inspect; may be null.</param>
    /// <param name="defaultEncoding">
    /// Encoding returned when the stream is null, shorter than two bytes,
    /// or does not start with a recognised preamble.
    /// </param>
    /// <returns>
    /// <see cref="Encoding.BigEndianUnicode"/> for FE FF,
    /// <see cref="Encoding.Unicode"/> for FF FE or when the second byte is zero,
    /// <see cref="Encoding.UTF8"/> for EF BB BF,
    /// otherwise <paramref name="defaultEncoding"/>.
    /// </returns>
    public static Encoding GetEncoding(FileStream stream, Encoding defaultEncoding)
    {
        if (stream == null || stream.Length < 2)
        {
            return defaultEncoding;
        }

        byte[] header = new byte[4];
        int headerLength;

        // Remember where the caller was BEFORE seeking, so it can be put back afterwards.
        long originalPosition = stream.Position;
        try
        {
            stream.Seek(0, SeekOrigin.Begin);
            headerLength = ReadHeader(stream, header);
        }
        finally
        {
            stream.Seek(originalPosition, SeekOrigin.Begin);
        }

        if (headerLength < 2)
        {
            // Fewer bytes were readable than Length reported; nothing to inspect.
            return defaultEncoding;
        }

        if (header[0] == 0xFE && header[1] == 0xFF)
        {
            // UTF-16 big-endian byte order mark.
            return Encoding.BigEndianUnicode;
        }

        if (header[0] == 0xFF && header[1] == 0xFE)
        {
            // UTF-16 little-endian byte order mark. The bytes after the mark are
            // ordinary text and must not influence the decision.
            return Encoding.Unicode;
        }

        if (headerLength >= 3 && header[0] == 0xEF && header[1] == 0xBB && header[2] == 0xBF)
        {
            // UTF-8 byte order mark.
            return Encoding.UTF8;
        }

        if (header[1] == 0x00)
        {
            // Heuristic: no BOM, but a zero second byte is what UTF-16 LE text
            // looks like when its first character is below U+0100.
            return Encoding.Unicode;
        }

        return defaultEncoding;
    }

    /// <summary>
    /// Fills <paramref name="buffer"/> from the current stream position, looping
    /// because a single Read call may return fewer bytes than requested.
    /// </summary>
    /// <returns>The number of bytes actually read (less than the buffer size at end of stream).</returns>
    private static int ReadHeader(Stream stream, byte[] buffer)
    {
        int total = 0;
        while (total < buffer.Length)
        {
            int read = stream.Read(buffer, total, buffer.Length - total);
            if (read <= 0)
            {
                break;
            }
            total += read;
        }
        return total;
    }
}
}

Dobzhansky 2009-09-25

打赏
举报

判断完全是不可能的, u8 的也不是一定有那个 BOM 头,
碰到这样的, 我现在用 ICSharpCode.TextEditor 中的一个辅助类,
代码是:



using System;

using System.IO;

using System.Text;



namespace ICSharpCode.TextEditor.Util

{

	/// <summary>
	/// Class that can open text files with auto-detection of the encoding.
	/// Files that start with a byte order mark are left to StreamReader's own
	/// detection; other files are scanned to decide between UTF-8 and a
	/// caller-supplied default encoding.
	/// </summary>
	public static class FileReader
	{
		/// <summary>
		/// Tells whether the encoding is one of the UTF code pages
		/// (UTF-8, UTF-7, UTF-16 LE or UTF-16 BE).
		/// </summary>
		/// <param name="encoding">The encoding to test; must not be null.</param>
		/// <returns>True if the code page is 65001, 65000, 1200 or 1201.</returns>
		public static bool IsUnicode(Encoding encoding)
		{
			int codepage = encoding.CodePage;
			// 65001 = UTF-8, 65000 = UTF-7, 1200 = UTF-16 LE, 1201 = UTF-16 BE
			return codepage == 65001 || codepage == 65000 || codepage == 1200 || codepage == 1201;
		}

		/// <summary>
		/// Reads the whole file as text, auto-detecting its encoding.
		/// </summary>
		/// <param name="fileName">Path of the file to read.</param>
		/// <param name="encoding">
		/// On entry: suggested encoding, passed on to OpenStream (may be null).
		/// On return: the encoding that was actually used to decode the file.
		/// </param>
		/// <param name="defaultEncoding">
		/// Encoding used when the file has no byte order mark and is not valid UTF-8.
		/// </param>
		/// <returns>The decoded file content.</returns>
		public static string ReadFileContent(string fileName, ref Encoding encoding, Encoding defaultEncoding)
		{
			using (FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read)) {
				using (StreamReader reader = OpenStream(fs, encoding, defaultEncoding)) {
					// StreamReader performs its byte-order-mark detection on the first read,
					// so CurrentEncoding is only trustworthy after the content has been read.
					string content = reader.ReadToEnd();
					encoding = reader.CurrentEncoding;
					return content;
				}
			}
		}

		/// <summary>
		/// Wraps the stream in a StreamReader that uses the detected encoding.
		/// The stream is read from its current position, which is expected to be
		/// the start of the file; detection rewinds to absolute position 0.
		/// </summary>
		/// <param name="fs">The file stream to read.</param>
		/// <param name="suggestedEncoding">
		/// Encoding used only for files of at most three bytes, which are too short
		/// to analyse; may be null, in which case StreamReader's default applies.
		/// </param>
		/// <param name="defaultEncoding">
		/// Encoding used when the content has no byte order mark and is not valid UTF-8.
		/// </param>
		/// <returns>A reader positioned at the start of the file.</returns>
		public static StreamReader OpenStream(FileStream fs, Encoding suggestedEncoding, Encoding defaultEncoding)
		{
			if (fs.Length > 3) {
				// the autodetection of StreamReader is not capable of detecting the difference
				// between ISO-8859-1 and UTF-8 without BOM.
				int firstByte = fs.ReadByte();
				int secondByte = fs.ReadByte();
				switch ((firstByte << 8) | secondByte) {
					case 0x0000: // either UTF-32 Big Endian or a binary file; use StreamReader
					case 0xfffe: // Unicode BOM (UTF-16 LE or UTF-32 LE)
					case 0xfeff: // UTF-16 BE BOM
						// StreamReader autodetection works
						fs.Position = 0;
						return new StreamReader(fs);
					case 0xefbb: // possible start of UTF-8 BOM
						if (fs.ReadByte() == 0xbf) {
							// complete UTF-8 BOM (EF BB BF): StreamReader autodetection works
							fs.Position = 0;
							return new StreamReader(fs);
						}
						// EF BB without BF is not a BOM; these can be ordinary characters of a
						// legacy encoding, so analyse the content like any other BOM-less file.
						fs.Position = 2; // AutoDetect continues reading after the two bytes it is given
						return AutoDetect(fs, (byte)firstByte, (byte)secondByte, defaultEncoding);
					default:
						return AutoDetect(fs, (byte)firstByte, (byte)secondByte, defaultEncoding);
				}
			} else {
				// too short to analyse: rely on the caller's suggestion, if any
				if (suggestedEncoding != null) {
					return new StreamReader(fs, suggestedEncoding);
				} else {
					return new StreamReader(fs);
				}
			}
		}

		/// <summary>
		/// Scans up to 500,000 bytes of a BOM-less file with a small state machine that
		/// checks whether the bytes form valid UTF-8 multi-byte sequences.
		/// </summary>
		/// <param name="fs">Stream positioned just after the first two bytes.</param>
		/// <param name="firstByte">First byte of the file, already consumed by the caller.</param>
		/// <param name="secondByte">Second byte of the file, already consumed by the caller.</param>
		/// <param name="defaultEncoding">
		/// Encoding used for pure ASCII or non-UTF-8 content; replaced by
		/// Encoding.Default when it is itself a UTF encoding.
		/// </param>
		/// <returns>A reader over the stream, rewound to position 0.</returns>
		static StreamReader AutoDetect(FileStream fs, byte firstByte, byte secondByte, Encoding defaultEncoding)
		{
			int max = (int)Math.Min(fs.Length, 500000); // look at max. 500 KB
			const int ASCII = 0;        // only bytes below 0x80 seen so far
			const int Error = 1;        // byte pattern that is invalid in UTF-8
			const int UTF8  = 2;        // at least one complete multi-byte sequence seen
			const int UTF8Sequence = 3; // currently inside a multi-byte sequence
			int state = ASCII;
			int sequenceLength = 0;     // continuation bytes still expected
			byte b;
			for (int i = 0; i < max; i++) {
				if (i == 0) {
					b = firstByte;
				} else if (i == 1) {
					b = secondByte;
				} else {
					b = (byte)fs.ReadByte();
				}
				if (b < 0x80) {
					// normal ASCII character
					if (state == UTF8Sequence) {
						// ASCII byte where a continuation byte was required
						state = Error;
						break;
					}
				} else if (b < 0xc0) {
					// 10xxxxxx : continues UTF8 byte sequence
					if (state == UTF8Sequence) {
						--sequenceLength;
						if (sequenceLength < 0) {
							state = Error;
							break;
						} else if (sequenceLength == 0) {
							state = UTF8;
						}
					} else {
						// continuation byte without a preceding lead byte
						state = Error;
						break;
					}
				} else if (b >= 0xc2 && b < 0xf5) {
					// beginning of byte sequence
					if (state == UTF8 || state == ASCII) {
						state = UTF8Sequence;
						if (b < 0xe0) {
							sequenceLength = 1; // one more byte following
						} else if (b < 0xf0) {
							sequenceLength = 2; // two more bytes following
						} else {
							sequenceLength = 3; // three more bytes following
						}
					} else {
						// new lead byte before the previous sequence was complete
						state = Error;
						break;
					}
				} else {
					// 0xc0, 0xc1, 0xf5 to 0xff are invalid in UTF-8 (see RFC 3629)
					state = Error;
					break;
				}
			}
			fs.Position = 0;
			switch (state) {
				case ASCII:
				case Error:
					// when the file seems to be ASCII or non-UTF8,
					// we read it using the user-specified encoding so it is saved again
					// using that encoding.
					if (IsUnicode(defaultEncoding)) {
						// the file is not Unicode, so don't read it using Unicode even if the
						// user has chosen Unicode as the default encoding.

						// If we don't do this, SD will end up always adding a Byte Order Mark
						// to ASCII files.
						defaultEncoding = Encoding.Default; // use system encoding instead
					}
					return new StreamReader(fs, defaultEncoding);
				default:
					// valid UTF-8 content: StreamReader's default encoding handles it
					return new StreamReader(fs);
			}
		}
	}

}