37,721
社区成员
发帖
与我相关
我的任务
分享
use strict;
use LWP::Simple;
# Script 1: download one Sina blog article page and print, one per line,
# its title, date, tag string and category, followed by the body text.
my $article_url = 'http://blog.sina.com.cn/s/blog_486e105c010095ej.html';
my $c = get($article_url);

# LWP::Simple::get returns undef on any failure; stop with a clear message
# instead of silently matching against undef and printing blank lines.
die "Failed to fetch $article_url\n" unless defined $c;

my ($title, $date, $tag, $category, $body);

# Each section is cut out with an explicit capture group rather than $&,
# and copied into a lexical before being handed to its parser.
if (my ($title_html) = $c =~ m!(<div class="articleTitle">.*?</div>)!s) {
    ($title, $date) = parseTitleDate($title_html);
}
if (my ($tag_html) = $c =~ m!(<div class="articleTag">.*?</div>)!s) {
    ($tag, $category) = parseTagCategory($tag_html);
}
if (my ($body_html) = $c =~ m!(<div id="articleBody".*?</div>)!s) {
    $body = parseBody($body_html);
}

# A section that was not found is printed as an empty line.
for my $field ($title, $date, $tag, $category, $body) {
    $field = '' unless defined $field;
}
print "$title\n$date\n$tag\n$category\n";
print "$body\n";
# parseTitleDate($html)
#
# Pulls the article title and date out of an HTML fragment.
#   $html - fragment to scan (the caller passes the "articleTitle" <div>).
# The title is the tag-free text immediately before a closing </b>; the
# date is the parenthesised, tag-free text immediately before a closing
# </span>.
# Returns the two-element list ($title, $date); an element is undef when
# its pattern does not match.
sub parseTitleDate {
    my $c = shift;

    # A match in list context yields the capture on success and an empty
    # list on failure, leaving the variable undef.  This replaces
    # "my $x = $1 if COND", whose behaviour perlsyn documents as undefined.
    my ($title) = $c =~ m!>([^<>]*)</b>!;
    my ($date)  = $c =~ m!>\(([^<>]*)\)</span>!;

    return ($title, $date);
}
# parseTagCategory($html)
#
# Pulls the tag string and the category out of an HTML fragment.
#   $html - fragment to scan (the caller passes the "articleTag" <div>).
# The tag string is the single-quoted value assigned to a literal "$tag"
# in the fragment ($tag='...').  The category is taken from a <td> cell:
# any text before the cell's first nested tag is skipped, the remainder up
# to the last </td> on that line is captured, and markup is stripped.
# Returns the list ($tag, $category); an element is undef when its pattern
# does not match.
sub parseTagCategory {
    my $c = shift;

    # List-context matches instead of "my $x = $1 if COND", whose behaviour
    # perlsyn documents as undefined.
    my ($tag)      = $c =~ /\$tag='([^']*)'/;
    my ($category) = $c =~ m!<td[^<>]*>[^<>]*(.*)</td>!;

    # Only strip markup when a cell was actually found.
    $category =~ s/<[^<>]*>//g if defined $category;

    return $tag, $category;
}
# parseBody($html)
#
# Reduces an HTML fragment to plain text.
#   $html - fragment to convert (the caller passes the "articleBody" <div>).
# Returns the text with each <p>...</p> paragraph and each <br> turned
# into a line break and every other tag removed.  All whitespace present
# in the input is discarded; the only whitespace in the result comes from
# the inserted line breaks and from converted &nbsp; entities.
sub parseBody {
    my $c = shift;

    # Drop attributes so that tags can be matched by bare name below.  The
    # name is limited to non-space, non-bracket characters so the match
    # cannot run across a '>' and swallow text between tags.
    $c =~ s/<([^\s<>]+)\s[^<>]*>/<$1>/g;

    # Remove all source whitespace first; the line breaks added afterwards
    # are therefore the only ones in the result.
    $c =~ s/\s//g;

    # Paragraphs and <br> become newlines ($1, not \1, on the replacement
    # side of s///).
    $c =~ s!<p>(.*?)</p>!$1\n!gi;
    $c =~ s!<br/?>!\n!gi;

    # Strip whatever markup is left.
    $c =~ s/<[^<>]*>//g;

    # Convert non-breaking-space entities to ordinary spaces.  The line
    # previously replaced a space with a space, which did nothing.
    $c =~ s/&nbsp;/ /g;

    return $c;
}
use strict;
use LWP::Simple;
# Script 2: download one page of comments for the same article and print,
# for every comment, the user name, the user's link, the timestamp and the
# message, followed by a blank line.
my $comments_url = 'http://blog.sina.com.cn/s/comment_486e105c010095ej_2.html';
my $c = get($comments_url);

# LWP::Simple::get returns undef on any failure; stop with a clear message
# rather than running the substitutions below against undef.
die "Failed to fetch $comments_url\n" unless defined $c;

# Remove images and photo blocks before matching the comment entries.
# NOTE(review): the photo pattern has no /s flag, so a photo <div> that
# spans several lines is left in place - confirm against the real markup.
$c =~ s!<img\s[^<>]*/>!!g;
$c =~ s!<div class="photo">.*?</div>!!g;

while ( $c =~ m!<div class="userInfo[^<>]*>\s*(<a\s[^<>]+>)?([^<>]+?)(?:</a>)?\s*</div>\s*<div[^<>]*>\s*<div>([^<>]*?)</div>\s*<span[^<>]*>\(([^<>]*)\)</span>!g) {
    # Copy all captures at once, before any further match can reset them.
    my ($anchor, $user, $message, $time) = ($1, $2, $3, $4);

    # The opening <a ...> tag is optional in the pattern.  Print an empty
    # link line when there is no anchor, or the anchor has no href, instead
    # of printing undef or the raw tag.
    my $link = '';
    if (defined $anchor && $anchor =~ /href="([^"]*)"/) {
        $link = $1;
    }

    print "$user\n$link\n$time\n$message\n\n";
}