我以前写过一个基于 smthbbs web 的抓取网站新闻的 PHP 脚本,放在 crontab 里每天定时
去抓新闻并贴到指定的版面上。好处在于结构简单、不用编译;缺点是对不同的新闻
站点需要花点心思去构造不同的正则表达式,用来过滤 HTML 标记等无用数据,还有就是
抓来的长文章经常会出现乱码。可以凑合着用用。
先建立一个用户用来自动贴新闻,用户名、密码和需要张贴的版面在代码里设定(
用其他方式设定好后在代码里读过来也行),代码如下:
-----------------------------------------------------------
<?php
// Entry script: logs in as a dedicated posting account, then walks every
// configured news category and posts today's articles to one board.
require("funcs.php");
require("boards.php");
// Set up the account used for automatic posting (by ryu, 2005.01.10).
$data = array ();
$id = ""; // user name of the auto-posting account
$passwd = ""; // password of that account
// NOTE(review): the return value of bbs_checkpasswd() is ignored, so a failed
// login is not detected here.
bbs_checkpasswd($id,$passwd);
// NOTE(review): $loginok is not read anywhere in this script; presumably it is
// consulted by the included helpers — TODO confirm.
$loginok=0;
// Fills $data with the current user's session info; the return value is sent
// below as the UTMPNUM cookie.
$num=bbs_getcurrentuinfo($data);
// Session cookies built from the user info fetched above.
setcookie("UTMPKEY",$data["utmpkey"],0,"");
setcookie("UTMPNUM",$num,0,"");
setcookie("UTMPUSERID",$data["userid"],0,"");
setcookie("LOGINTIME",$data["logintime"],0,"");
// Board that receives the posts; postnews() reads this as a global.
$boardName = "ECUSTExpress";
// $boardName = "test";
$brdArr=array();
// NOTE(review): $boardID and $brdArr are not used later in this script.
$boardID= bbs_getboard($boardName,$brdArr);
html_init("gb2312");
// Parallel arrays: $cataid[i] is the category id used in the list-page URL,
// $cataname[i] is the label getcata() puts in front of the post title.
$cataid = array(6, 7, 8, 10, 12, 20, 21, 22, 26, 27, 28, 30);
$cataname = array("要闻", "综合", "教科", "学生", "后勤", "专题", "媒体", "其他", "外事", "人物", "简讯", "校园");
// Pass the array index (used to look up the label) and the category id.
for ($i=0; $i<count($cataid); $i++) {
getcata($i, $cataid[$i]);
}
html_normal_quit();
/**
 * Fetch one category's list page and post every article dated today.
 *
 * The list page is reduced with html2txt() (which keeps the "<a ...>" tags),
 * the region between 'type=sptitle">' and '[<a' is cut out, and that region
 * is split into entries of the form '<a href ... ·'. For each entry the first
 * quoted attribute value is the article link, the first title="..." is the
 * headline, and the second title="..." starts with the date in "Y-n-j" form
 * followed by a space.
 *
 * @param int $ckey Index into the global $cataname array (label for the title).
 * @param int $cid  Category id appended to the list-page URL.
 * @return void
 */
function getcata($ckey, $cid)
{
    global $cataname;

    $listUrl = "http://news.ecust.edu.cn/index_list.asp?cataid=".$cid;
    $listpage = file_get_contents($listUrl);
    if ($listpage === false) {
        // List page unreachable: nothing to post for this category.
        return;
    }

    $listtext = html2txt($listpage);
    if (!preg_match("/type=sptitle\">(.*?)\[<a/i", $listtext, $out)) {
        // Page layout not recognised; avoid reading an undefined match.
        return;
    }
    preg_match_all("/<a href(.*?)·/i", $out[1], $lines);

    // foreach instead of each(): each() was removed in PHP 8.
    foreach ($lines[1] as $line) {
        if (!preg_match("/=\"(.*?)\"/i", $line, $link)) {
            continue; // entry without a quoted link
        }
        preg_match_all("/title=\"(.*?)\"/i", $line, $titles);
        if (!isset($titles[1][1])) {
            continue; // entry lacks the headline/date pair of title attributes
        }

        // The second title attribute is "<date> <rest>"; keep the date part.
        $parts = explode(" ", $titles[1][1]);
        if ($parts[0] != date("Y-n-j")) {
            continue; // only today's articles are posted
        }

        $text = getnews("http://news.ecust.edu.cn/".$link[1]);
        $subject = "[".$cataname[$ckey]."]".$titles[1][0]."[zz]";
        // html2txt2() leaves a single "\r" per line break; turn each one into
        // a blank line between paragraphs.
        $text = str_replace("\r", "\n\n", (string) $text);
        // Pause before posting so consecutive posts are spaced out
        // (postnews() reports error -6 when posts follow too closely).
        sleep(7);
        postnews($subject, $text);
    }
}
/**
 * Download one article page and return its body as plain text.
 *
 * The page is stripped of all markup with html2txt2(); the body is whatever
 * lies between the "::.." marker and the "相关新闻" heading.
 *
 * @param string $url Absolute URL of the article page.
 * @return string Extracted article text, or "" when the page cannot be
 *                fetched or the markers are not found.
 */
function getnews($url)
{
    $newspage = file_get_contents($url);
    if ($newspage === false) {
        // Fetch failed; report "no text" instead of reading a bad handle.
        return "";
    }

    $plain = html2txt2($newspage);
    if (!preg_match("/::\.\.(.*?)相关新闻/i", $plain, $out)) {
        // Markers missing: avoid returning an undefined match group.
        return "";
    }
    return $out[1];
}
/**
 * Post one article to the board named by the global $boardName.
 *
 * Title and body are first passed through a regex that drops a backslash
 * standing directly before ', |, " or another backslash. The post is made as
 * a new thread (reply id 0) with signature 0, no outgoing relay and not
 * anonymous. A negative result code from bbs_postarticle() listed below is
 * reported through html_error_quit().
 *
 * @param string $title Article subject.
 * @param string $text  Article body.
 * @return void
 */
function postnews($title, $text)
{
    global $boardName;

    $replyTo = 0;       // 0: not a reply to an existing article
    $signatureNo = 0;
    $relayOut = 0;      // whether to relay the post to other sites
    $anonymous = 0;     // whether to post anonymously

    // Same un-escape pattern for subject and body.
    $unescape = "/\\\(['|\"|\\\])/";
    $subject = preg_replace($unescape, "$1", $title);
    $body = preg_replace($unescape, "$1", $text);

    $ret = bbs_postarticle($boardName, $subject, $body, $signatureNo, $replyTo,
        $relayOut, $anonymous);

    // Map the known failure codes to their messages; anything else is silent.
    $failure = null;
    switch ($ret) {
        case -1:
            $failure = "错误的讨论区名称!";
            break;
        case -2:
            $failure = "本版为二级目录版!";
            break;
        case -3:
            $failure = "标题为空!";
            break;
        case -4:
            $failure = "此讨论区是唯读的, 或是您尚无权限在此发表文章!";
            break;
        case -5:
            $failure = "很抱歉, 你被版务人员停止了本版的post权利!";
            break;
        case -6:
            $failure = "两次发文间隔过密,请休息几秒再试!";
            break;
        case -7:
            $failure = "无法读取索引文件! 请通知站务人员, 谢谢! ";
            break;
        case -8:
            $failure = "本文不可回复!";
            break;
        case -9:
            $failure = "系统内部错误, 请迅速通知站务人员, 谢谢!";
            break;
    }
    if ($failure !== null) {
        html_error_quit($failure);
    }
}
/**
 * Reduce an HTML document to text while keeping tags that start with "<a".
 *
 * Removes <script> sections, strips every tag whose first character after
 * "<" is not "a"/"A" (so opening anchor tags survive, closing "</a>" does
 * not), collapses whitespace that follows a line break, and decodes a small
 * set of HTML entities plus decimal numeric entities.
 *
 * The whole substitution list is applied twice, as in the original code, so
 * markup that only appears after entity decoding is processed as well.
 *
 * Numeric entities are decoded with preg_replace_callback(): the former "/e"
 * modifier no longer exists in PHP 7+, and it also evaluated "&#065;" as an
 * octal literal.
 *
 * @param string $document HTML source.
 * @return string Text with anchors preserved.
 */
function html2txt($document)
{
    $search = array ("'<script[^>]*?>.*?</script>'si", // strip out javascript
        "'<[^a][\/\!]*?[^<>]*?>'si", // strip html tags, except those starting with "<a"
        "'([\r\n])[\s]+'", // strip white space after a line break
        "'&(quot|#34);'i", // replace html entities
        "'&(amp|#38);'i",
        "'&(lt|#60);'i",
        "'&(gt|#62);'i",
        "'&(nbsp|#160);'i",
        "'&(iexcl|#161);'i",
        "'&(cent|#162);'i",
        "'&(pound|#163);'i",
        "'&(copy|#169);'i");
    $replace = array ("",
        "",
        "\\1",
        "\"",
        "&",
        "<",
        ">",
        " ",
        chr(161),
        chr(162),
        chr(163),
        chr(169));

    $text = $document;
    for ($pass = 0; $pass < 2; $pass++) {
        $text = preg_replace($search, $replace, $text);
        // Remaining decimal entities, handled last exactly as before.
        $text = preg_replace_callback("'&#(\d+);'", "_html2txt_numeric_entity", $text);
    }
    return $text;
}

/**
 * Callback for html2txt(): turn the digits of "&#NNN;" into a single byte.
 *
 * @param array $match Regex match; $match[1] holds the decimal digits.
 * @return string One-byte string from chr().
 */
function _html2txt_numeric_entity($match)
{
    return chr(intval($match[1], 10));
}
/**
 * Reduce an HTML document to plain text, stripping every tag.
 *
 * Removes <script> sections, strips all tags (unlike html2txt(), anchors are
 * removed too), collapses whitespace that follows a line break, and decodes a
 * small set of HTML entities plus decimal numeric entities.
 *
 * The whole substitution list is applied twice, as in the original code, so
 * tags that only appear after entity decoding (e.g. "&lt;b&gt;") are removed
 * by the second pass.
 *
 * Numeric entities are decoded with preg_replace_callback(): the former "/e"
 * modifier no longer exists in PHP 7+, and it also evaluated "&#065;" as an
 * octal literal.
 *
 * @param string $document HTML source.
 * @return string Plain text.
 */
function html2txt2($document)
{
    $search = array ("'<script[^>]*?>.*?</script>'si", // strip out javascript
        "'<[\/\!]*?[^<>]*?>'si", // strip all html tags
        "'([\r\n])[\s]+'", // strip white space after a line break
        "'&(quot|#34);'i", // replace html entities
        "'&(amp|#38);'i",
        "'&(lt|#60);'i",
        "'&(gt|#62);'i",
        "'&(nbsp|#160);'i",
        "'&(iexcl|#161);'i",
        "'&(cent|#162);'i",
        "'&(pound|#163);'i",
        "'&(copy|#169);'i");
    $replace = array ("",
        "",
        "\\1",
        "\"",
        "&",
        "<",
        ">",
        " ",
        chr(161),
        chr(162),
        chr(163),
        chr(169));

    $text = $document;
    for ($pass = 0; $pass < 2; $pass++) {
        $text = preg_replace($search, $replace, $text);
        // Remaining decimal entities, handled last exactly as before.
        $text = preg_replace_callback("'&#(\d+);'", "_html2txt2_numeric_entity", $text);
    }
    return $text;
}

/**
 * Callback for html2txt2(): turn the digits of "&#NNN;" into a single byte.
 *
 * @param array $match Regex match; $match[1] holds the decimal digits.
 * @return string One-byte string from chr().
 */
function _html2txt2_numeric_entity($match)
{
    return chr(intval($match[1], 10));
}
?>
-----------------------------------------------------------------
【 在 zbs168 (Skytiger) 的大作中提到: 】
: 屡调屡败
: 大家找个牛人一起来研究一下这个代码
: 需要改的地方不少
: ...................
--
FROM 202.120.110.*