php文章内容抓取

yanfangphp 2014-08-13 09:57:36
求大神帮忙抓取这个网页http://sports.sohu.com/zhongchao.shtml的排行榜部分的数据(包括积分榜和射手榜)


...全文
241 7 打赏 收藏 转发到动态 举报
写回复
用AI写文章
7 条回复
切换为时间正序
请发表友善的回复…
发表回复
傲雪星枫 2014-08-13
  • 打赏
  • 举报
回复
sohu的页面是gb2312的,采集后需要转utf8,否则会乱码

// Scrapes the league standings and the top-scorer list from the Sohu CSL page
// and prints both as arrays of rows.

/**
 * Extracts the data rows of one ranking table.
 *
 * A data row is a <tr> holding exactly four <td> cells whose third cell is
 * purely numeric; header rows (<th>) are therefore skipped.
 *
 * @param string $tableHtml UTF-8 HTML of a single <table> element.
 * @return array List of rows, each array(rank, name without tags, third column, fourth column).
 */
function parseRankingRows($tableHtml)
{
	$rowPattern = '/<tr>\s*<td>(.+?)<\/td>\s*<td>(.+?)<\/td>\s*<td>(\d+)<\/td>\s*<td>(.+?)<\/td>\s*<\/tr>/i';

	$board = array();
	if (preg_match_all($rowPattern, $tableHtml, $rows, PREG_SET_ORDER)) {
		foreach ($rows as $row) {
			// The name cell wraps the text in an <a> link; keep only the text.
			$board[] = array($row[1], strip_tags($row[2]), $row[3], $row[4]);
		}
	}

	return $board;
}

echo '<meta http-equiv="content-type" content="text/html;charset=utf-8">';

$url = 'http://sports.sohu.com/zhongchao.shtml';
$s = file_get_contents($url);
if ($s === false) {
	exit('Failed to download ' . $url);
}

// The page is served in GB2312/GBK; convert it so the output matches the
// UTF-8 charset declared above. 'UTF-8' is the canonical charset name; the
// unhyphenated 'UTF8' is not accepted by every iconv implementation.
$s = iconv('GBK', 'UTF-8', $s);
if ($s === false) {
	exit('Failed to convert the page from GBK to UTF-8');
}

// Every <table> that directly follows a <div class="turn cons"> opener.
preg_match_all('/(?<=<div class="turn cons">)\s<table.+table>/isU', $s, $m);

// Standings: third matched table on the page. Fall back to an empty string
// so a layout change yields an empty result instead of an undefined-offset notice.
$scoreboard = parseRankingRows(isset($m[0][2]) ? $m[0][2] : '');

print_r($scoreboard);

// Top scorers: fourth matched table on the page.
$shooterboard = parseRankingRows(isset($m[0][3]) ? $m[0][3] : '');

print_r($shooterboard);
积分榜

Array
(
    [0] => Array
        (
            [0] => 01
            [1] => 广州恒大
            [2] => 20
            [3] => 45
        )

    [1] => Array
        (
            [0] => 02
            [1] => 北京国安
            [2] => 19
            [3] => 41
        )

    [2] => Array
        (
            [0] => 03
            [1] => 广州富力
            [2] => 19
            [3] => 34
        )

    [3] => Array
        (
            [0] => 04
            [1] => 上海东亚
            [2] => 19
            [3] => 31
        )

    [4] => Array
        (
            [0] => 05
            [1] => 贵州茅台
            [2] => 19
            [3] => 30
        )

    [5] => Array
        (
            [0] => 06
            [1] => 山东鲁能
            [2] => 19
            [3] => 28
        )

    [6] => Array
        (
            [0] => 07
            [1] => 天津泰达
            [2] => 19
            [3] => 27
        )

    [7] => Array
        (
            [0] => 08
            [1] => 江苏舜天
            [2] => 18
            [3] => 25
        )

    [8] => Array
        (
            [0] => 09
            [1] => 上海绿地
            [2] => 20
            [3] => 23
        )

    [9] => Array
        (
            [0] => 10
            [1] => 长春亚泰
            [2] => 19
            [3] => 21
        )

    [10] => Array
        (
            [0] => 11
            [1] => 杭州绿城
            [2] => 19
            [3] => 21
        )

    [11] => Array
        (
            [0] => 12
            [1] => 大连阿尔滨
            [2] => 19
            [3] => 20
        )

    [12] => Array
        (
            [0] => 13
            [1] => 上海申鑫
            [2] => 19
            [3] => 19
        )

    [13] => Array
        (
            [0] => 14
            [1] => 河南建业
            [2] => 19
            [3] => 17
        )

    [14] => Array
        (
            [0] => 15
            [1] => 辽宁宏运
            [2] => 19
            [3] => 16
        )

    [15] => Array
        (
            [0] => 16
            [1] => 哈尔滨毅腾
            [2] => 18
            [3] => 12
        )

)
射手榜

Array
(
    [0] => Array
        (
            [0] => 01
            [1] => 埃尔克森
            [2] => 17
            [3] => 广州恒大
        )

    [1] => Array
        (
            [0] => 02
            [1] => 哈默德
            [2] => 16
            [3] => 广州富力
        )

    [2] => Array
        (
            [0] => 03
            [1] => 海森
            [2] => 13
            [3] => 上海东亚
        )

    [3] => Array
        (
            [0] => 04
            [1] => 达维
            [2] => 9
            [3] => 广州富力
        )

    [4] => Array
        (
            [0] => 04
            [1] => 多利
            [2] => 9
            [3] => 哈尔滨毅腾
        )

    [5] => Array
        (
            [0] => 04
            [1] => 洛维
            [2] => 9
            [3] => 山东鲁能
        )

    [6] => Array
        (
            [0] => 04
            [1] => 拉蒙
            [2] => 9
            [3] => 杭州绿城
        )

    [7] => Array
        (
            [0] => 08
            [1] => 德扬
            [2] => 8
            [3] => 北京国安
        )

    [8] => Array
        (
            [0] => 09
            [1] => 巴塔拉
            [2] => 7
            [3] => 北京国安
        )

    [9] => Array
        (
            [0] => 09
            [1] => 布鲁诺
            [2] => 7
            [3] => 大连阿尔滨
        )

    [10] => Array
        (
            [0] => 09
            [1] => 里卡多
            [2] => 7
            [3] => 哈尔滨毅腾
        )

    [11] => Array
        (
            [0] => 09
            [1] => 武磊
            [2] => 7
            [3] => 上海东亚
        )

    [12] => Array
        (
            [0] => 09
            [1] => 埃尼奥
            [2] => 7
            [3] => 长春亚泰
        )

    [13] => Array
        (
            [0] => 09
            [1] => 尤里
            [2] => 7
            [3] => 贵州茅台
        )

    [14] => Array
        (
            [0] => 15
            [1] => 莫雷诺
            [2] => 6
            [3] => 上海绿地
        )

    [15] => Array
        (
            [0] => 15
            [1] => 雷内
            [2] => 6
            [3] => 广州恒大
        )

)
yanfangphp 2014-08-13
  • 打赏
  • 举报
回复
引用 2 楼 xuzuning 的回复:
// Download the raw page; no character-set conversion is applied to the bytes.
$url = 'http://sports.sohu.com/zhongchao.shtml';
$s = file_get_contents($url);
// Collect every <table>...</table> that directly follows a <div class="turn cons">
// opener (case-insensitive, dot matches newlines, ungreedy).
preg_match_all('/(?<=<div class="turn cons">)\s<table.+table>/isU', $s, $m);
// Print only the tables whose markup contains the literal header text; preg_grep keeps the original keys.
// NOTE(review): the literal is compared byte-for-byte against the unconverted page, so this
// presumably only matches when the script file is saved in the page's own encoding - confirm.
print_r(preg_grep('/名次/', $m[0]));
Array
(
    [2] => 
<table border=0 cellSpacing=0 cellPadding=0 width="100%">
<tbody>
<tr>
<th width="15%">名次</th>
<th width="47%">球队</th>
<th width="9%">场次</th>
<th width="29%">积分</th></tr>
<tr>
<td>01</td>
<td><a href="http://sports.sohu.com/s2010/7742/s277701524/" target="_blank">广州恒大</a></td>
<td>20</td>
<td>45</td>
</tr>
<tr>
<td>02</td>
<td><a href="http://sports.sohu.com/s2006/7742/s242155493/" target="_blank">北京国安</a></td>
......
接下来自己做
我输出出来的怎么是一个空数组?
果酱很好吃 2014-08-13
  • 打赏
  • 举报
回复
// Extracts every four-cell data row from the page and splits the columns into
// the standings ($jifen) and the top-scorer list ($sheshou).
// Both results are column-oriented: index 1..4 is the cell position, and each
// entry is the list of values found in that cell across the rows.
echo '<meta http-equiv="content-type" content="text/html;charset=utf-8">';

$url = "http://sports.sohu.com/zhongchao.shtml";
$str = file_get_contents($url);
if ($str === false) {
	exit('Failed to download ' . $url);
}

// The page is served in GB2312/GBK; convert it so the printed text is valid
// for the UTF-8 charset declared above.
$str = iconv('GBK', 'UTF-8', $str);
if ($str === false) {
	exit('Failed to convert the page from GBK to UTF-8');
}

$found = preg_match_all('/<tr>\s*<td>(.+?)<\/td>\s*<td>(.+?)<\/td>\s*<td>(\d+)<\/td>\s*<td>(.+?)<\/td>\s*<\/tr>/i',$str,$match1);
if ($found === false) {
	exit('Row pattern failed to run');
}

// The first 16 matched rows are the standings (one per club); every row after
// that belongs to the top-scorer list.
$standingRows = 16;

// Initialise both results so they are defined even when nothing matched.
$jifen = array();
$sheshou = array();
for ($col = 1; $col <= 4; $col++) {
	$jifen[$col] = array_slice($match1[$col], 0, $standingRows);
	$sheshou[$col] = array_slice($match1[$col], $standingRows);
}

echo "<pre>";
print_r($jifen);
print_r($sheshou);
echo "</pre>";
/*
Array
(
    [1] => Array
        (
            [0] => 01
            [1] => 02
            [2] => 03
            [3] => 04
            [4] => 05
            [5] => 06
            [6] => 07
            [7] => 08
            [8] => 09
            [9] => 10
            [10] => 11
            [11] => 12
            [12] => 13
            [13] => 14
            [14] => 15
            [15] => 16
        )

    [2] => Array
        (
            [0] => 广州恒大
            [1] => 北京国安
            [2] => 广州富力
            [3] => 上海东亚
            [4] => 贵州茅台
            [5] => 山东鲁能
            [6] => 天津泰达
            [7] => 江苏舜天
            [8] => 上海绿地
            [9] => 长春亚泰
            [10] => 杭州绿城
            [11] => 大连阿尔滨
            [12] => 上海申鑫
            [13] => 河南建业
            [14] => 辽宁宏运
            [15] => 哈尔滨毅腾
        )

    [3] => Array
        (
            [0] => 20
            [1] => 19
            [2] => 19
            [3] => 19
            [4] => 19
            [5] => 19
            [6] => 19
            [7] => 18
            [8] => 20
            [9] => 19
            [10] => 19
            [11] => 19
            [12] => 19
            [13] => 19
            [14] => 19
            [15] => 18
        )

    [4] => Array
        (
            [0] => 45
            [1] => 41
            [2] => 34
            [3] => 31
            [4] => 30
            [5] => 28
            [6] => 27
            [7] => 25
            [8] => 23
            [9] => 21
            [10] => 21
            [11] => 20
            [12] => 19
            [13] => 17
            [14] => 16
            [15] => 12
        )

)
Array
(
    [1] => Array
        (
            [0] => 01
            [1] => 02
            [2] => 03
            [3] => 04
            [4] => 04
            [5] => 04
            [6] => 04
            [7] => 08
            [8] => 09
            [9] => 09
            [10] => 09
            [11] => 09
            [12] => 09
            [13] => 09
            [14] => 15
            [15] => 15
        )

    [2] => Array
        (
            [0] => 埃尔克森
            [1] => 哈默德
            [2] => 海森
            [3] => 达维
            [4] => 多利
            [5] => 洛维
            [6] => 拉蒙
            [7] => 德扬
            [8] => 巴塔拉
            [9] => 布鲁诺
            [10] => 里卡多
            [11] => 武磊
            [12] => 埃尼奥
            [13] => 尤里
            [14] => 莫雷诺
            [15] => 雷内
        )

    [3] => Array
        (
            [0] => 17
            [1] => 16
            [2] => 13
            [3] => 9
            [4] => 9
            [5] => 9
            [6] => 9
            [7] => 8
            [8] => 7
            [9] => 7
            [10] => 7
            [11] => 7
            [12] => 7
            [13] => 7
            [14] => 6
            [15] => 6
        )

    [4] => Array
        (
            [0] => 广州恒大
            [1] => 广州富力
            [2] => 上海东亚
            [3] => 广州富力
            [4] => 哈尔滨毅腾
            [5] => 山东鲁能
            [6] => 杭州绿城
            [7] => 北京国安
            [8] => 北京国安
            [9] => 大连阿尔滨
            [10] => 哈尔滨毅腾
            [11] => 上海东亚
            [12] => 长春亚泰
            [13] => 贵州茅台
            [14] => 上海绿地
            [15] => 广州恒大
        )

)
*/
后面的自己处理吧
xuyanlu 2014-08-13
  • 打赏
  • 举报
回复
给你推荐个类 simple_html_dom


// Prints the raw HTML of the standings and top-scorer panels using the
// third-party simple_html_dom parser.
include "simple_html_dom.class.php";

$url = "http://sports.sohu.com/zhongchao.shtml";
$page = file_get_contents($url);
if ($page === false) {
	exit('Failed to download ' . $url);
}

$dom = new simple_html_dom();
// Query the parser object itself instead of load()'s return value:
// NOTE(review): load() does not appear to return the object in every release
// of the library - confirm against the version in use.
$dom->load($page);
$html = $dom;

$res = $html->find("div#turnIDB div.turn");
# Standings panel (skipped when the selector finds nothing)
if (isset($res[0])) {
	echo $res[0]->outertext;
}
# Top-scorer panel
if (isset($res[1])) {
	echo $res[1]->outertext;
}


结果
猪崽儿0o0 2014-08-13
  • 打赏
  • 举报
回复
可以使用preg_match去抓取对应的HTML代码然后再正则过滤你想要的数据即可。
xuzuning 2014-08-13
  • 打赏
  • 举报
回复
// Prints the ranking tables, i.e. the tables whose header contains the text for "rank".
$url = 'http://sports.sohu.com/zhongchao.shtml';
$s = file_get_contents($url);
if ($s === false) {
	exit('Failed to download ' . $url);
}

// The page is served in GB2312/GBK while the header literal below is stored in
// this script's own encoding. Convert the page to UTF-8 (save this script as
// UTF-8 too); otherwise the byte sequences never match and the result is an
// empty array.
$s = iconv('GBK', 'UTF-8', $s);
if ($s === false) {
	exit('Failed to convert the page from GBK to UTF-8');
}

// Every <table> that directly follows a <div class="turn cons"> opener.
preg_match_all('/(?<=<div class="turn cons">)\s<table.+table>/isU', $s, $m);
// preg_grep keeps the original keys, so the printed indexes are the tables' positions in $m[0].
print_r(preg_grep('/名次/', $m[0]));
Array
(
    [2] => 
<table border=0 cellSpacing=0 cellPadding=0 width="100%">
<tbody>
<tr>
<th width="15%">名次</th>
<th width="47%">球队</th>
<th width="9%">场次</th>
<th width="29%">积分</th></tr>
<tr>
<td>01</td>
<td><a href="http://sports.sohu.com/s2010/7742/s277701524/" target="_blank">广州恒大</a></td>
<td>20</td>
<td>45</td>
</tr>
<tr>
<td>02</td>
<td><a href="http://sports.sohu.com/s2006/7742/s242155493/" target="_blank">北京国安</a></td>
......
接下来自己做
liu510817387 2014-08-13
  • 打赏
  • 举报
回复
抓取的话,可以研究研究 phpQuery

21,886

社区成员

发帖
与我相关
我的任务
社区描述
从PHP安装配置,PHP入门,PHP基础到PHP应用
社区管理员
  • 基础编程社区
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧