1,339
社区成员




01
抓包做准备
要做一个抓取的,当然是先抓包啦~
拿出我的 Burp Suite(下文简称 bp),和 zoomeye 篇一样,先输入一个关键字进行搜索,方便之后在 bp 里找到带有这个关键字的请求
然后回车~
发现我输入的关键字在这个GET的请求包里面
把这个请求发送到 Repeater 模块重放后:
发现返回了我要的搜索结果
02
使用php的curl来模拟访问
PHP支持的由Daniel Stenberg创建的libcurl库允许你与各种的服务器使用各种类型的协议进行连接和通讯。
libcurl目前支持http、https、ftp、gopher、telnet、dict、file和ldap协议。libcurl同时也支持HTTPS认证、HTTP POST、HTTP PUT、 FTP 上传(这个也能通过PHP的FTP扩展完成)、HTTP 基于表单的上传、代理、cookies和用户名+密码的认证。
这些函数在PHP 4.0.2中被引入。
就是说,在php4.0.2中就已经引入了curl,而且还可以做post和get,真是太有用了有木有
拿出我刚刚记录好的请求包~
Connection: close
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36
Sec-Fetch-Dest: document
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
Sec-Fetch-Site: none
Sec-Fetch-Mode: navigate
Sec-Fetch-User: ?1
Accept-Language: zh-CN,zh;q=0.9
Cookie: PSTM=1588249253; BAIDUID=C34E0834A4B2DA6CBA0B25FA3A67FC8D:FG=1; BIDUPSID=735A45B6473102ED12E4236A4401AE21; BD_UPN=12314353; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDUSS=2FaaXBqcDhaY0p-U1JvMXJ3dnVVdnJlSklhelZkSEQ1aGF1a1lWYjh3WnNVOUplSVFBQUFBJCQAAAAAAAAAAAEAAADfyRotztLX3MrHzvvO-7n-uf4AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGzGql5sxqpeZ; H_PS_PSSID=1461_31325_21098_31254_31342_31271_31464_30824_31164_22158; delPer=0; BD_CK_SAM=1; PSINO=5; sug=3; sugstore=0; ORIGIN=2; bdime=0; H_PS_645EC=fe7bDpIqw0Ye%2FC9V9rTqXv5ARp5x3G1lJcPTrEHREGKf1YbuRCoB6oR0frw
然后用php语言来描述它:
<?php
/**
 * Fetch a URL with a browser-like GET request and return the raw response.
 *
 * Despite the name, no POST is sent: the request is a plain GET that replays
 * the headers captured from a browser session.
 *
 * @param string $url Absolute URL to request.
 * @return string|false Response headers followed by the body (CURLOPT_HEADER
 *                      is enabled), or false when the transfer fails.
 */
function curl_post($url)
{
    // Example target:
    // $url = 'https://www.baidu.com/s?wd=%22Office%20Anywhere%22&tn=93348797_hao_pg&ie=utf-8&ch=1&pn=' . $pn;

    // Headers replayed from the captured browser request.
    // NOTE(review): the Cookie value is the session captured in the article;
    // replace it with your own session cookie before running.
    $headers = array(
        'Connection: close',
        'Upgrade-Insecure-Requests: 1',
        'User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
        'Sec-Fetch-Dest: document',
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Sec-Fetch-Site: none',
        'Sec-Fetch-Mode: navigate',
        'Sec-Fetch-User: ?1',
        'Accept-Language: zh-CN,zh;q=0.9',
        'Cookie: PSTM=1588249253; BAIDUID=C34E0834A4B2DA6CBA0B25FA3A67FC8D:FG=1; BIDUPSID=735A45B6473102ED12E4236A4401AE21; BD_UPN=12314353; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDUSS=2FaaXBqcDhaY0p-U1JvMXJ3dnVVdnJlSklhelZkSEQ1aGF1a1lWYjh3WnNVOUplSVFBQUFBJCQAAAAAAAAAAAEAAADfyRotztLX3MrHzvvO-7n-uf4AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGzGql5sxqpeZ; H_PS_PSSID=1461_31325_21098_31254_31342_31271_31464_30824_31164_22158; delPer=0; BD_CK_SAM=1; PSINO=5; sug=3; sugstore=0; ORIGIN=2; bdime=0; H_PS_645EC=fe7bDpIqw0Ye%2FC9V9rTqXv5ARp5x3G1lJcPTrEHREGKf1YbuRCoB6oR0frw',
    );

    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    // NOTE(review): TLS certificate and host verification are switched off,
    // exactly as in the original snippet. Enable both if a CA bundle is
    // available, since the request carries a session cookie.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
    // Return the response as a string instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // Keep the response headers in the returned string.
    curl_setopt($ch, CURLOPT_HEADER, true);
    // Bound the request so an unresponsive server cannot hang the script.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);

    $output = curl_exec($ch);
    if ($output === false) {
        // Surface the reason; the caller still receives false as before.
        error_log('curl_post: request to ' . $url . ' failed: ' . curl_error($ch));
    }
    curl_close($ch);

    return $output;
}
?>
这样子就可以完成一次php中curl对baidu的请求了
然后用正则的方法取出其中的链接(http://www.baidu.com/link?url=xxxxx)
// Fetch the search result page for $url via curl_post().
$a=curl_post($url);
// Match result anchors: capture group 1 is the href value. The trailing lazy
// group sits at the very end of the pattern, so it always matches the empty string.
$pattern = '/<a target="_blank" href="(.*?)"(.*?)" class="(.*?)/i';
// Collect every match: $match[0] holds the full matches, $match[N] holds capture group N.
preg_match_all($pattern, $a, $match);
如果有不会的可以看我的第一篇(zoomeye篇)
保留head
最后就会把返回值$a打印在屏幕上了
但是百度返回的结果链接是 www.baidu.com/link?url=xxxxxxxx 这种跳转链接,并不是目标站点的真实地址
所以还要再请求一次这个跳转链接,从响应头的 Location 字段里取出真实链接:
/**
 * Resolve a redirect link to its target by reading the Location response header.
 *
 * Sends a bare GET request over a raw socket and scans only the response
 * headers; the body is never read. If several Location headers are present,
 * the last one wins.
 *
 * @param string $url Absolute http:// or https:// URL expected to answer with a redirect.
 * @return string The Location header value, or '' when the URL has no host,
 *                the connection fails, or no Location header is returned.
 */
function get_real($url){
    $info = parse_url($url);
    if ($info === false || empty($info['host'])) {
        // Nothing to connect to.
        return '';
    }

    // Pick transport and port from the scheme instead of always using plain port 80.
    $secure = isset($info['scheme']) && strtolower($info['scheme']) === 'https';
    $port = isset($info['port']) ? $info['port'] : ($secure ? 443 : 80);

    // Build the request target; a missing path means "/", a missing query means no "?".
    $target = (isset($info['path']) && $info['path'] !== '') ? $info['path'] : '/';
    if (isset($info['query'])) {
        $target .= '?' . $info['query'];
    }

    // Only mention the port in the Host header when the URL set one explicitly.
    $hostHeader = $info['host'];
    if (isset($info['port'])) {
        $hostHeader .= ':' . $info['port'];
    }

    $fp = fsockopen(($secure ? 'ssl://' : '') . $info['host'], $port, $errno, $errstr, 30);
    if ($fp === false) {
        // Without this check the read loop below would never terminate.
        return '';
    }
    // Bound reads as well as the connect, so a stalled server cannot hang the loop.
    stream_set_timeout($fp, 30);

    fputs($fp, "GET {$target} HTTP/1.1\r\n");
    fputs($fp, "Host: {$hostHeader}\r\n");
    fputs($fp, "Connection: close\r\n\r\n");

    $rewrite = '';
    while (!feof($fp)) {
        $line = fgets($fp);
        // Stop on read failure/timeout or at the blank line that ends the headers.
        if ($line === false || $line === "\r\n" || $line === "\n") {
            break;
        }
        // Header names are case-insensitive and must start the line
        // (this also avoids matching "Content-Location:").
        if (stripos($line, 'Location:') === 0) {
            $rewrite = trim(substr($line, strlen('Location:')));
        }
    }
    fclose($fp);

    return $rewrite;
}
最后输出即可
03
所以完整代码如下
<?php
/**
 * Baidu result scraper: walks the result pages for a keyword, resolves every
 * www.baidu.com/link?url=... redirect to its real target and prints the targets.
 *
 * Set the keyword in get_url() before running.
 */

/**
 * Resolve a redirect link to its target by reading the Location response header.
 *
 * Sends a bare GET request over a raw socket and scans only the response
 * headers; the body is never read. If several Location headers are present,
 * the last one wins.
 *
 * @param string $url Absolute http:// or https:// URL expected to answer with a redirect.
 * @return string The Location header value, or '' when the URL has no host,
 *                the connection fails, or no Location header is returned.
 */
function get_real($url)
{
    $info = parse_url($url);
    if ($info === false || empty($info['host'])) {
        return '';
    }

    // Pick transport and port from the scheme instead of always using plain port 80.
    $secure = isset($info['scheme']) && strtolower($info['scheme']) === 'https';
    $port = isset($info['port']) ? $info['port'] : ($secure ? 443 : 80);

    // Build the request target; a missing path means "/", a missing query means no "?".
    $target = (isset($info['path']) && $info['path'] !== '') ? $info['path'] : '/';
    if (isset($info['query'])) {
        $target .= '?' . $info['query'];
    }

    // Only mention the port in the Host header when the URL set one explicitly.
    $hostHeader = $info['host'];
    if (isset($info['port'])) {
        $hostHeader .= ':' . $info['port'];
    }

    $fp = fsockopen(($secure ? 'ssl://' : '') . $info['host'], $port, $errno, $errstr, 30);
    if ($fp === false) {
        // Without this check the read loop below would never terminate.
        return '';
    }
    // Bound reads as well as the connect, so a stalled server cannot hang the loop.
    stream_set_timeout($fp, 30);

    fputs($fp, "GET {$target} HTTP/1.1\r\n");
    fputs($fp, "Host: {$hostHeader}\r\n");
    fputs($fp, "Connection: close\r\n\r\n");

    $rewrite = '';
    while (!feof($fp)) {
        $line = fgets($fp);
        // Stop on read failure/timeout or at the blank line that ends the headers.
        if ($line === false || $line === "\r\n" || $line === "\n") {
            break;
        }
        // Header names are case-insensitive and must start the line.
        if (stripos($line, 'Location:') === 0) {
            $rewrite = trim(substr($line, strlen('Location:')));
        }
    }
    fclose($fp);

    return $rewrite;
}

/**
 * Return the text between the first $leftStr and the next $rightStr after it.
 *
 * get_url() now reads the href straight from the regex capture group, so this
 * helper is kept only for any other code that still calls it.
 *
 * @param string $str      Text to search.
 * @param string $leftStr  Left delimiter (not included in the result).
 * @param string $rightStr Right delimiter (not included in the result).
 * @return string The enclosed text, or '' when either delimiter is missing.
 */
function getSubstr($str, $leftStr, $rightStr)
{
    // strpos() signals "not found" with false, never with a negative number.
    $left = strpos($str, $leftStr);
    if ($left === false) {
        return '';
    }

    // Look for the right delimiter only after the left one has ended.
    $begin = $left + strlen($leftStr);
    $right = strpos($str, $rightStr, $begin);
    if ($right === false) {
        return '';
    }

    return substr($str, $begin, $right - $begin);
}

/**
 * Fetch a URL with a browser-like GET request and return the raw response.
 *
 * Despite the name, no POST is sent: the request is a plain GET that replays
 * the headers captured from a browser session.
 *
 * @param string $url Absolute URL to request.
 * @return string|false Response headers followed by the body (CURLOPT_HEADER
 *                      is enabled), or false when the transfer fails.
 */
function curl_post($url)
{
    // Example target:
    // $url = 'https://www.baidu.com/s?wd=%22Office%20Anywhere%22&tn=93348797_hao_pg&ie=utf-8&ch=1&pn=' . $pn;

    // Headers replayed from the captured browser request.
    // NOTE(review): the Cookie value is the session captured in the article;
    // replace it with your own session cookie before running.
    $headers = array(
        'Connection: close',
        'Upgrade-Insecure-Requests: 1',
        'User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
        'Sec-Fetch-Dest: document',
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Sec-Fetch-Site: none',
        'Sec-Fetch-Mode: navigate',
        'Sec-Fetch-User: ?1',
        'Accept-Language: zh-CN,zh;q=0.9',
        // Kept on a single line: a line break inside a header value corrupts the request.
        'Cookie: PSTM=1588249253; BAIDUID=C34E0834A4B2DA6CBA0B25FA3A67FC8D:FG=1; BIDUPSID=735A45B6473102ED12E4236A4401AE21; BD_UPN=12314353; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDUSS=2FaaXBqcDhaY0p-U1JvMXJ3dnVVdnJlSklhelZkSEQ1aGF1a1lWYjh3WnNVOUplSVFBQUFBJCQAAAAAAAAAAAEAAADfyRotztLX3MrHzvvO-7n-uf4AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGzGql5sxqpeZ; H_PS_PSSID=1461_31325_21098_31254_31342_31271_31464_30824_31164_22158; delPer=0; BD_CK_SAM=1; PSINO=5; sug=3; sugstore=0; ORIGIN=2; bdime=0; H_PS_645EC=fe7bDpIqw0Ye%2FC9V9rTqXv5ARp5x3G1lJcPTrEHREGKf1YbuRCoB6oR0frw',
    );

    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    // NOTE(review): TLS certificate and host verification are switched off,
    // exactly as in the original script. Enable both if a CA bundle is
    // available, since the request carries a session cookie.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
    // Return the response as a string instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // Keep the response headers in the returned string.
    curl_setopt($ch, CURLOPT_HEADER, true);
    // Bound the request so an unresponsive server cannot hang the script.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);

    $output = curl_exec($ch);
    if ($output === false) {
        // Surface the reason; the caller still receives false as before.
        error_log('curl_post: request to ' . $url . ' failed: ' . curl_error($ch));
    }
    curl_close($ch);

    return $output;
}

/**
 * Fetch one Baidu result page and resolve its result links to real targets.
 *
 * @param int $page Value of the "pn" query parameter (the script steps it by 10).
 * @return array Resolved target URLs keyed 1..n, as in the original script;
 *               an empty array when the page cannot be fetched or holds no
 *               result links. An entry is '' when its redirect could not be resolved.
 */
function get_url($page)
{
    /* Search keyword: set your own keyword here. */
    $wddd = "'Office Anywhere'";
    /* End of keyword setting. */
    $url = "https://www.baidu.com/s?wd=" . urlencode($wddd) . "&tn=93348797_hao_pg&ie=utf-8&ch=1&pn=" . $page;

    $links = array();

    $html = curl_post($url);
    if ($html === false) {
        return $links;
    }

    // Capture group 1 is the href value of each result anchor.
    $pattern = '/<a target="_blank" href="(.*?)"(.*?)" class="(.*?)/i';
    if (!preg_match_all($pattern, $html, $match)) {
        return $links;
    }

    $o = 1;
    foreach ($match[1] as $href) {
        // Only Baidu redirect links are results; accept both http and https forms.
        if (preg_match('#^https?://www\.baidu\.com/link\?url=#i', $href)) {
            $links[$o] = get_real($href);
            $o++;
        }
    }

    return $links;
}

// Walk the result pages (pn = 0, 10, ..., 500) and print every resolved link
// in page order.
// NOTE(review): the links come from remote Location headers and are echoed
// unescaped; wrap them in htmlspecialchars() if the output is served as HTML.
for ($i = 0; $i <= 500; $i += 10) {
    foreach (get_url($i) as $link) {
        if ($link !== '') {
            echo $link . '</p>';
        }
    }
}
?>
关键词要自己设定哦。我在源码中已经标明设定关键词的地方辽