如何提取一个页面中所有的文字（纯文字，排除图片、链接、html 标签）

le616 2011-07-08 04:36:09

thanks!

...全文

577 19 打赏收藏转发到动态举报

写回复

用AI写文章

19 条回复

切换为时间正序

请发表友善的回复…

发表回复

mulk 2011-10-28

打赏
举报

学习了，正好在学这方面的呢

asuntir123 2011-10-28

打赏
举报

如果页面有大量的css呢？

zthsn 2011-07-14

打赏
举报

帮顶学习下

spring0906 2011-07-14

打赏
举报

收藏了

haihuan23 2011-07-14

打赏
举报

mark

新时代新目标新征程 2011-07-14

打赏
举报

不错收藏之

arecaiz 2011-07-14

打赏
举报



 /// <summary>
 /// Strips markup from an HTML fragment and returns the remaining text,
 /// HTML-encoded and trimmed.
 /// </summary>
 /// <param name="Htmlstring">Raw HTML. <c>null</c> or empty yields an empty string.</param>
 /// <returns>
 /// The text content with script blocks, style blocks, comments, tags and
 /// stray angle brackets removed. The result is HTML-encoded, so it can be
 /// written straight into a page.
 /// </returns>
 public static string DropHTML(string Htmlstring)
 {
     if (string.IsNullOrEmpty(Htmlstring))
     {
         return string.Empty;
     }

     // Singleline lets '.' match line breaks. Without it a script, style
     // block or comment spanning several lines is not matched, and its
     // body would leak into the output once the tags are stripped.
     RegexOptions block = RegexOptions.IgnoreCase | RegexOptions.Singleline;

     // Remove scripts and style sheets together with their content.
     Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", block);
     Htmlstring = Regex.Replace(Htmlstring, @"<style[^>]*?>.*?</style>", "", block);

     // Remove complete comments first: a comment may contain '>' and
     // would otherwise be cut in half by the generic tag pattern below.
     Htmlstring = Regex.Replace(Htmlstring, @"<!--.*?-->", "", block);

     // Remove the remaining tags.
     Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

     // Collapse a line break followed by further whitespace.
     Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "");

     // Clean up what is left of unterminated or dangling comments.
     Htmlstring = Regex.Replace(Htmlstring, @"-->", "");
     Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "");

     // Decode the handful of entities this helper knows about.
     Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
     Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
     Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
     Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
     Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\u00a1", RegexOptions.IgnoreCase);
     Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\u00a2", RegexOptions.IgnoreCase);
     Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\u00a3", RegexOptions.IgnoreCase);
     Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\u00a9", RegexOptions.IgnoreCase);

     // Drop every other numeric entity. "&#38;" is spared here because it
     // is decoded as an ampersand in the next step.
     Htmlstring = Regex.Replace(Htmlstring, @"&#(?!38;)(\d+);", "");

     // "&amp;" is decoded last. Decoding it first would turn "&amp;lt;"
     // into "&lt;" and then into "<", i.e. decode the text twice.
     Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);

     // Strings are immutable: the result of Replace must be assigned,
     // otherwise these three calls have no effect at all.
     Htmlstring = Htmlstring.Replace("<", "");
     Htmlstring = Htmlstring.Replace(">", "");
     Htmlstring = Htmlstring.Replace("\r\n", "");

     // WebUtility.HtmlEncode does not need an active HTTP request, unlike
     // HttpContext.Current.Server, which is null on background threads.
     // NOTE(review): if the application configures a custom HttpEncoder,
     // confirm that bypassing it here is acceptable.
     return System.Net.WebUtility.HtmlEncode(Htmlstring).Trim();
 }

wang_jian_999 2011-07-14

打赏
举报

/// <summary>
/// Removes HTML tags from the given text, then removes every space,
/// line feed and carriage return from what is left.
/// </summary>
/// <param name="str">Text that may contain HTML tags.</param>
/// <returns>The text with tags, spaces and line breaks removed.</returns>
protected static string ConvertGettext(string str)
{
    // Singleline lets '.' match line breaks, so a tag whose attributes
    // are spread over several lines is stripped as well.
    Regex tagPattern = new Regex(@"\<(.*?)\>", RegexOptions.Singleline);

    return tagPattern.Replace(str, "").Replace(" ", "").Replace("\n", "").Replace("\r", "");
}

weike021996 2011-07-14

打赏
举报

ASPNETCHENGXU 2011-07-14

打赏
举报

正则吧。。

realdja 2011-07-14

打赏
举报

le616 2011-07-08

打赏
举报

[Quote=引用 2 楼 q107770540 的回复:]
C# code

/// <summary>
/// 去掉html标记
/// </summary>
/// <param name="str"></param>
/// <returns></returns>
protected static string ConvertGettext(string str)
{
……
[/Quote]
-----------------
具体情况是这样的：想调用微软的翻译服务实现多国语言。
一次传输太多内容会翻译不了，所以想把需要翻译的内容分成几段传输过去。
也就是先把网页生成的 html 中需要翻译的内容提取出来，然后再翻译。

子夜__ 2011-07-08

打赏
举报

using System;

using System.Collections.Generic;

using System.Linq;

using System.Text;

using System.Net;

using System.IO;

using System.IO.Compression;

using System.Text.RegularExpressions;



namespace WikiPageCreater.Common
{
    /// <summary>
    /// Helpers for downloading a web page and for working out which
    /// character encoding it declares.
    /// </summary>
    public class PageHelper
    {
        /// <summary>Request timeout in milliseconds.</summary>
        private const int RequestTimeoutMilliseconds = 20000;

        /// <summary>Responses whose Content-Length is this large or larger are ignored.</summary>
        private const long MaxContentLength = 1024 * 1024;

        /// <summary>
        /// Finds a <c>charset=...</c> declaration. An optional opening quote
        /// is skipped so that both <c>charset=utf-8</c> (inside a content
        /// attribute) and <c>charset="utf-8"</c> yield the bare name.
        /// </summary>
        private static readonly Regex CharsetPattern = new Regex(
            @"charset\b\s*=\s*[""']?(?<charset>[A-Za-z0-9_.:+\-]+)",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Gets the character set name of the page at the given url.
        /// </summary>
        /// <param name="url">Address of the page.</param>
        /// <returns>
        /// The charset declared inside the document if there is one,
        /// otherwise the charset reported by the response, otherwise
        /// <c>Encoding.Default.BodyName</c>. The latter is also returned
        /// when the page cannot be downloaded.
        /// </returns>
        public static string GetEncoding(string url)
        {
            string html;
            string headerCharset;

            // ASCII is enough here: only the charset declaration is of interest.
            if (!TryDownload(url, Encoding.ASCII, out html, out headerCharset))
            {
                return Encoding.Default.BodyName;
            }

            Match declared = CharsetPattern.Match(html);
            if (declared.Success)
            {
                return declared.Groups["charset"].Value;
            }

            // The response charset may be null or empty when the server
            // sent no usable Content-Type; never hand that to the caller.
            if (!string.IsNullOrEmpty(headerCharset))
            {
                return headerCharset;
            }

            return Encoding.Default.BodyName;
        }

        /// <summary>
        /// Downloads the HTML source of the page at the given url, decoded
        /// with the given encoding.
        /// </summary>
        /// <param name="url">Address of the page.</param>
        /// <param name="encoding">Encoding used to decode the response body.</param>
        /// <returns>
        /// The page source, or an empty string when the request fails, the
        /// status is not 200 OK, or the declared length is 1 MB or more.
        /// </returns>
        public static string GetHtml(string url, Encoding encoding)
        {
            string html;
            string headerCharset;

            if (TryDownload(url, encoding, out html, out headerCharset))
            {
                return html;
            }

            return string.Empty;
        }

        /// <summary>
        /// Performs the GET request shared by the public methods.
        /// </summary>
        /// <param name="url">Address of the page.</param>
        /// <param name="encoding">Encoding used to decode the response body.</param>
        /// <param name="html">Receives the decoded body on success, otherwise null.</param>
        /// <param name="headerCharset">Receives the charset reported by the response; may be null or empty.</param>
        /// <returns>True when the body was read; false on any failure or rejected response.</returns>
        private static bool TryDownload(string url, Encoding encoding, out string html, out string headerCharset)
        {
            html = null;
            headerCharset = null;

            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Timeout = RequestTimeoutMilliseconds;
                request.AllowAutoRedirect = false;

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= MaxContentLength)
                    {
                        return false;
                    }

                    // Must be read while the response is still open.
                    headerCharset = response.CharacterSet;

                    Stream body = response.GetResponseStream();
                    if (string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
                    {
                        body = new GZipStream(body, CompressionMode.Decompress);
                    }

                    // Disposing the reader disposes the stream chain; the
                    // enclosing using then releases the response itself.
                    using (StreamReader reader = new StreamReader(body, encoding))
                    {
                        html = reader.ReadToEnd();
                    }

                    return true;
                }
            }
            catch (Exception ex)
            {
                // Best effort by design: callers get a fallback value and
                // never an exception. The failure is traced so that it is
                // not lost silently.
                System.Diagnostics.Trace.TraceWarning("PageHelper: request to '{0}' failed: {1}", url, ex.Message);
                html = null;
                return false;
            }
        }
    }
}

然后正则取

mimangshamo 2011-07-08

打赏
举报

用正则去掉页面所有HTML

chenhongjun0624 2011-07-08

打赏
举报

不会帮顶

moonwrite 2011-07-08

打赏
举报

http://topic.csdn.net/u/20110508/21/b00b79a9-c90b-4a01-8ab6-225243547d04.html

q107770540 2011-07-08

打赏
举报



/// <summary>
/// Removes HTML tags from the given text, then removes every space,
/// line feed and carriage return from what is left.
/// </summary>
/// <param name="str">Text that may contain HTML tags.</param>
/// <returns>The text with tags, spaces and line breaks removed.</returns>
protected static string ConvertGettext(string str)
{
    // Singleline lets '.' match line breaks, so a tag whose attributes
    // are spread over several lines is stripped as well.
    Regex tagPattern = new Regex(@"\<(.*?)\>", RegexOptions.Singleline);

    string withoutTags = tagPattern.Replace(str, "");

    return withoutTags.Replace(" ", "").Replace("\n", "").Replace("\r", "");
}

具体情况你可以发个示例来看看