爬取页面需要登录才可爬取，这种情况怎么解决

cainiao13579 2013-01-21 08:58:57

如题：要爬取某个页面，但必须先在它的登录页面登录，内容才会显示出来。请问这种情况该怎么解决？谢了。

...全文

1306 13 打赏收藏转发到动态举报

写回复

用AI写文章

13 条回复

切换为时间正序

请发表友善的回复…

发表回复

HeraLu 2013-01-25

打赏
举报

要看有没有验证码，有验证码比较麻烦，没有验证码的话，用WebBrowser控件，添加引用Microsoft.mshtml


using mshtml;
namespace Parser
{
    /// <summary>
    /// Form that hosts a WebBrowser control and simulates a login by filling
    /// the page's username/password inputs and clicking its submit button.
    /// Requires a reference to Microsoft.mshtml.
    /// </summary>
    class Spider : Form
    {
        /// <summary>
        /// Navigates the embedded browser to the login page when the form loads.
        /// </summary>
        private void Spider_Load(object sender, EventArgs e)
        {
            webBrowser1.Navigate("登录界面的URL");
        }

        /// <summary>
        /// DocumentCompleted handler. Returns early for events that do not belong
        /// to the browser's current URL and while the control is not fully loaded;
        /// no further work is performed here.
        /// </summary>
        private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            // Ignore events whose URL differs from the control's current URL.
            if (e.Url.ToString() != webBrowser1.Url.ToString())
            {
                return;
            }

            if (webBrowser1.ReadyState != WebBrowserReadyState.Complete)
            {
                return;
            }
        }

        /// <summary>
        /// Fills in the login form of the currently loaded page and submits it.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            // Nothing has been loaded yet, so there is no form to fill in.
            if (webBrowser1.Document == null)
            {
                return;
            }

            // The DOM object must be typed as an mshtml document (not a string)
            // so that its element collection can be enumerated.
            mshtml.HTMLDocument htmlDoc = (mshtml.HTMLDocument)webBrowser1.Document.DomDocument;
            IHTMLElementCollection eCollection = htmlDoc.all;
            IHTMLElement submitButton = null;

            foreach (IHTMLElement element in eCollection)
            {
                // Assumes the page keeps the username and password boxes in <input> elements.
                if (!string.Equals(element.tagName, "input", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // Assumes the username box is <input name=username> and the password box is
                // <input name=password>. Any other distinguishing attribute (e.g. id) works
                // just as well; adjust the attribute name and values to the target page.
                object attrtext = element.getAttribute("name", 0);
                if (attrtext != null)
                {
                    string inputName = attrtext.ToString();
                    if (inputName == "username")
                    {
                        element.setAttribute("value", "自己赋值");
                    }
                    if (inputName == "password")
                    {
                        element.setAttribute("value", "自己赋值");
                    }
                }

                // Assumes the submit button looks like <input type='submit' name='submit' ...>.
                // Only remember it here: clicking inside the loop could submit the form
                // before fields that appear later in the document have been filled in.
                object attr = element.getAttribute("type", 0);
                if (submitButton == null
                    && attr != null
                    && string.Equals(attr.ToString(), "submit", StringComparison.OrdinalIgnoreCase))
                {
                    submitButton = element;
                }
            }

            // Submit once, after every field has been populated.
            if (submitButton != null)
            {
                submitButton.click();
            }
        }
    }
}

这就完成模拟登录啦！登录完之后再执行 mshtml.HTMLDocument htmlDoc = (mshtml.HTMLDocument)webBrowser1.Document.DomDocument; 拿到登录后的文档对象，就可以从中取得登录后的源代码了（例如 htmlDoc.documentElement.outerHTML）！

一片冰心在玉壶 2013-01-25

打赏
举报

模拟用户登录使用HttpWebRequest发送用户信息

wansai00 2013-01-25

打赏
举报

手工登录拿到当前 Cookie，在你的程序里抓页面的时候把 Cookie 一并传过去。

枫c_2012 2013-01-25

打赏
举报

看到很不错、、不错、学学

txiangsun 2013-01-25

打赏
举报

关注下

yoyo_ 2013-01-25

打赏
举报

这个也就是模拟登录，没有验证码的好办。用抓包工具 Wireshark 看一下那个网站登录时是 POST 到哪个地址、需要带哪些参数，登录成功后再请求需要的数据页。最近我也做过类似的功能，代码你可以参考一下。


         /// <summary>
         /// Convenience overload: sends the request without an explicit content type
         /// by forwarding to the seven-argument overload with an empty content type.
         /// </summary>
         /// <param name="strURL">Target URL of the request.</param>
         /// <param name="strArgs">Request body to send.</param>
         /// <param name="strReferer">Value for the Referer header.</param>
         /// <param name="code">Name of the encoding used to turn <paramref name="strArgs"/> into bytes.</param>
         /// <param name="method">HTTP method to use.</param>
         /// <param name="cookieContainer">Cookie container attached to the request.</param>
         /// <returns>Whatever the seven-argument overload returns.</returns>
         public HttpWebResponse PostData(string strURL, string strArgs, string strReferer, string code, string method, CookieContainer cookieContainer)
        {
            // An empty content type is passed through to the full overload.
            string noContentType = string.Empty;
            HttpWebResponse forwarded = PostData(strURL, strArgs, strReferer, code, method, noContentType, cookieContainer);
            return forwarded;
        }
        /// <summary>
        /// Sends an HTTP request with a body and returns the response.
        /// </summary>
        /// <param name="strURL">Target URL of the request.</param>
        /// <param name="strArgs">Request body; <c>null</c> is sent as an empty body.</param>
        /// <param name="strReferer">Value for the Referer header.</param>
        /// <param name="code">Name of the encoding used to turn <paramref name="strArgs"/> into bytes.</param>
        /// <param name="method">HTTP method to use.</param>
        /// <param name="contentType">
        /// Content-Type header value; when null or empty,
        /// "application/x-www-form-urlencoded; charset=UTF-8" is used.
        /// </param>
        /// <param name="cookieContainer">
        /// Cookie container attached to the request; a new one is created when <c>null</c>.
        /// </param>
        /// <returns>
        /// The response, or <c>null</c> when the request failed for any reason
        /// (the failure is written to the trace output). The caller owns the
        /// returned response and must close it.
        /// </returns>
        public HttpWebResponse PostData(string strURL, string strArgs, string strReferer, string code, string method, string contentType, CookieContainer cookieContainer)
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strURL);
                request.AllowAutoRedirect = true;
                request.KeepAlive = true;
                request.Accept = "application/json, text/javascript, */*";
                request.Referer = strReferer;
                request.UserAgent = "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.96 Safari/537.4";

                // Use the caller's content type when one is supplied
                // (previously the literal text "contentType" was sent instead).
                request.ContentType = string.IsNullOrEmpty(contentType)
                    ? "application/x-www-form-urlencoded; charset=UTF-8"
                    : contentType;

                request.Method = method;

                // Let the framework negotiate and transparently decode gzip/deflate.
                // Advertising these encodings by hand (plus "sdch", which cannot be
                // decoded here) would hand callers a compressed stream they read as text.
                request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

                request.CookieContainer = cookieContainer ?? new CookieContainer();

                byte[] postData = Encoding.GetEncoding(code).GetBytes(strArgs ?? string.Empty);
                request.ContentLength = postData.Length;

                // Dispose the request stream even when the write fails.
                using (System.IO.Stream requestStream = request.GetRequestStream())
                {
                    requestStream.Write(postData, 0, postData.Length);
                }

                return (HttpWebResponse)request.GetResponse();
            }
            catch (Exception ex)
            {
                // Callers rely on a null result to signal failure, so keep that contract,
                // but record the reason instead of discarding it.
                System.Diagnostics.Trace.TraceError("出错了：" + ex.Message);
                return null;
            }
        }

          /// <summary>
          /// Forwards the login form posted to this page to the remote login endpoint,
          /// writes the remote answer to the current response, stores the login data in
          /// the session and then fetches a page using the same cookie container.
          /// </summary>
          /// <exception cref="System.InvalidOperationException">
          /// Thrown when the login request produced no response.
          /// </exception>
          private void login_ajax()
        {
            string username = Request.Form["username"];
            // Login endpoint.
            string LOGIN_URL = "http://mp.weixin.qq.com/cgi-bin/login?lang=zh_CN";
            // Referer sent with the login request.
            string LOGIN_REFERER = "http://mp.weixin.qq.com/cgi-bin/loginpage?t=wxm-login&lang=zh_CN";

            // Reuse the cookie container stored under "login_verify_code" when there is one;
            // fall back to a fresh container so that the very same instance is used for the
            // login request, the session entry and the follow-up request.
            CookieContainer cookie = (Session["login_verify_code"] as CookieContainer) ?? new CookieContainer();

            string pwd1 = Request.Form["pwd1"];
            string pwd2 = Request.Form["pwd2"];
            string imgcode = Request.Form["imgcode"];
            string f = Request.Form["f"];

            // Build the form body. Values come straight from the client, so each one is
            // URL-encoded; otherwise a value containing '&' or '=' would alter the request.
            string strArgs = "username=" + System.Uri.EscapeDataString(username ?? "");
            strArgs += "&pwd1=" + System.Uri.EscapeDataString(pwd1 ?? "");
            strArgs += "&pwd2=" + System.Uri.EscapeDataString(pwd2 ?? "");
            strArgs += "&imgcode=" + System.Uri.EscapeDataString(imgcode ?? "");
            strArgs += "&f=" + System.Uri.EscapeDataString(f ?? "");

            using (HttpWebResponse http_response = pt.PostData(LOGIN_URL, strArgs, LOGIN_REFERER, CODE, MOTHED, cookie))
            {
                // PostData reports failure by returning null.
                if (http_response == null)
                {
                    throw new System.InvalidOperationException("Login request failed: no response was received.");
                }

                // NOTE(review): Encoding.Default is the machine's ANSI code page — confirm it
                // matches the charset the remote server actually sends.
                using (StreamReader reader = new StreamReader(http_response.GetResponseStream(), Encoding.Default))
                {
                    content = reader.ReadToEnd();
                    Response.Write(content);
                }
            }

            Session["username"] = username;
            // NOTE(review): this keeps the submitted password value in session state —
            // confirm it is really needed and drop it if not.
            Session["pwd"] = pwd1;
            Session["login_wx"] = cookie;

            // Page to fetch once the login has been performed.
            string temp_url = "http://mp.weixin.qq.com/cgi-bin/userinfopage?t=wxm-setting&lang=zh_CN";

            // NOTE(review): GetResponseByGet is assumed to be able to return null like
            // PostData does — confirm against its implementation.
            using (HttpWebResponse http_response_get = pt.GetResponseByGet(temp_url, cookie))
            {
                if (http_response_get != null)
                {
                    using (StreamReader reader1 = new StreamReader(http_response_get.GetResponseStream(), Encoding.Default))
                    {
                        // HTML of the page fetched with the login cookies; process it here.
                        string content1 = reader1.ReadToEnd();
                    }
                }
            }
        }