请问smth有没有抓某个网站新闻到一个版面的程序

水木社区手机版

主题:请问smth有没有抓某个网站新闻到一个版面的程序
6楼|nfeng|2005-03-08 22:06:08|展开
我以前写过一个基于smthbbs web的抓网站新闻的php脚本，放在crontab里每天定时
去抓新闻贴到指定的版面上，好处在于结构简单、不用编译，缺点是对不同的新闻
站点需要花点心思去构造不同的过滤html标记等无用数据的正则表达式，还有就是
经常抓来的长文章会出现乱码。可以凑合着用用。

先建立一个用户用来自动贴新闻，用户名、密码和需要张贴的版面在代码里设定（
用其他方式设定好后在代码里读过来也行），代码如下：
-----------------------------------------------------------
<?php
        // Entry script: sets up the posting account, resolves the target board,
        // then walks every configured news category and lets getcata() post
        // today's items. Intended to be run periodically (e.g. from crontab).
        require("funcs.php");
        require("boards.php");

        // Set up the user that is used for automatic posting, by ryu, 2005.01.10
        $data = array ();
        $id = "";       // user name of the auto-posting account
        $passwd = "";   // password of that account
        // NOTE(review): the return value of bbs_checkpasswd() is discarded, so a
        // failed password check is not detected here.
        bbs_checkpasswd($id,$passwd);
        // NOTE(review): $loginok is not read anywhere in this file; presumably it
        // is consumed by code in funcs.php -- confirm before removing.
        $loginok=0;
        // Presumably fills $data (by reference) with the current session info and
        // returns the session slot number -- TODO confirm against the bbs extension.
        $num=bbs_getcurrentuinfo($data);
        // Export the session identifiers as cookies; this happens before
        // html_init() below, i.e. before any page output is started by this script.
        setcookie("UTMPKEY",$data["utmpkey"],0,"");
        setcookie("UTMPNUM",$num,0,"");
        setcookie("UTMPUSERID",$data["userid"],0,"");
        setcookie("LOGINTIME",$data["logintime"],0,"");

        // Board that receives the news; read by postnews() through `global`.
        $boardName = "ECUSTExpress";
//      $boardName = "test";
        $brdArr=array();
        // NOTE(review): $boardID and $brdArr are not used later in this file.
        $boardID= bbs_getboard($boardName,$brdArr);

        html_init("gb2312");

        // $cataid: category ids appended to the list URL by getcata().
        // $cataname: labels parallel to $cataid (same index), used by getcata()
        // as the "[label]" prefix of each posted title.
        $cataid = array(6, 7, 8, 10, 12, 20, 21, 22, 26, 27, 28, 30);
        $cataname = array("要闻", "综合", "教科", "学生", "后勤", "专题", "媒体", "其他", "外事", "人物", "简讯", "校园");

        // Process the categories one by one; $i is passed along so getcata()
        // can look up the matching label in $cataname.
        for ($i=0; $i<count($cataid); $i++) {
                getcata($i, $cataid[$i]);
        }

        html_normal_quit();

/**
 * Fetch the list page of one news category and post every item dated today.
 *
 * The list page is downloaded, reduced with html2txt() (which keeps the
 * opening <a ...> tags), and the block between `type=sptitle">` and `[<a` is
 * split into one entry per link. For each entry the first quoted attribute
 * value is used as the article URL, the first title="..." attribute as the
 * subject and the second title="..." attribute as the date string.
 *
 * @param int   $ckey Index into the global $cataname array (label for the title prefix).
 * @param mixed $cid  Category id appended to the list URL as `cataid`.
 * @return void
 */
function getcata($ckey, $cid)
{
        global $cataname;

        $url = "http://news.ecust.edu.cn/index_list.asp?cataid=".$cid;

        $inhandle = fopen($url, "rb");
        if ($inhandle === false) {
                // List page unreachable: skip this category instead of calling
                // fread()/fclose() on a non-resource.
                return;
        }
        $listpage = "";
        do {
                $data = fread($inhandle, 8192);
                if ($data === false || strlen($data) == 0) {
                        break;
                }
                $listpage .= $data;
        } while(true);
        fclose($inhandle);

        $tmp = html2txt($listpage);
        if (!preg_match("/type=sptitle\">(.*?)\[<a/i", $tmp, $out)) {
                // Page layout not recognised; nothing to post.
                return;
        }
        preg_match_all("/<a href(.*?)·/i", $out[1], $lines);

        // Evaluated once so that every entry of this run is compared against
        // the same date.
        $today = date("Y-n-j");

        // foreach replaces the each()/list() idiom, which was removed in PHP 8.
        foreach ($lines[1] as $line) {
                if (!preg_match("/=\"(.*?)\"/i", $line, $link)) {
                        continue;       // no link target found in this entry
                }
                preg_match_all("/title=\"(.*?)\"/i", $line, $titles);
                if (count($titles[1]) < 2) {
                        continue;       // need both the subject and the date attribute
                }

                // The second title attribute is expected to start with the
                // date, followed by a space.
                $parts = explode(" ", $titles[1][1]);
                $strtoday = $parts[0];

                if ($strtoday === $today) {
                        $text = getnews("http://news.ecust.edu.cn/".$link[1]);
                        $subject = "[".$cataname[$ckey]."]".$titles[1][0]."[zz]";
                        $text = str_replace("\r", "\n\n", $text);
                        // Pause between posts -- presumably to stay clear of the
                        // board's minimum posting interval (see error -6 in postnews()).
                        sleep(7);
                        postnews($subject, $text);
                }
        }
}

/**
 * Download one article page and return its body as plain text.
 *
 * The page is stripped with html2txt2() and the text between the markers
 * `::..` and `相关新闻` is returned.
 *
 * @param string $url Absolute URL of the article page.
 * @return string Extracted article text, or "" when the page cannot be
 *                opened or the markers are not found.
 */
function getnews($url)
{
        $inhandle = fopen($url, "rb");
        if ($inhandle === false) {
                // Unreachable page: report "no text" rather than reading from
                // a non-resource.
                return "";
        }
        $newspage = "";
        do {
                $data = fread($inhandle, 8192);
                if ($data === false || strlen($data) == 0) {
                        break;
                }
                $newspage .= $data;
        } while(true);
        fclose($inhandle);

        $tmp = html2txt2($newspage);
        if (!preg_match("/::\.\.(.*?)相关新闻/i", $tmp, $out)) {
                // Markers missing (layout changed or empty page).
                return "";
        }
        return $out[1];
}

/**
 * Post one article to the board named by the global $boardName.
 *
 * Backslashes that precede a quote, a pipe or another backslash are removed
 * from both the title and the text before posting. When bbs_postarticle()
 * reports one of the known error codes (-1 .. -9), the matching message is
 * handed to html_error_quit(); any other result is ignored.
 *
 * @param string $title Subject line of the article.
 * @param string $text  Body of the article.
 * @return void
 */
function postnews($title, $text)
{
        global $boardName;

        // One pattern, applied to the subject first and then to the body.
        $unescape = "/\\\(['|\"|\\\])/";
        $subject = preg_replace($unescape, "$1", $title);
        $body = preg_replace($unescape, "$1", $text);

        $signatureNo = 0;
        $replyTo = 0;           // 0: new thread, not a reply
        $forwardOut = 0;        // whether to forward the post off-site
        $anonymous = 0;         // whether to post anonymously

        $status = bbs_postarticle($boardName, $subject, $body,
                $signatureNo, $replyTo, $forwardOut, $anonymous);

        // Known failure codes and the message shown for each of them.
        $failures = array(
                -1 => "错误的讨论区名称!",
                -2 => "本版为二级目录版!",
                -3 => "标题为空!",
                -4 => "此讨论区是唯读的, 或是您尚无权限在此发表文章!",
                -5 => "很抱歉, 你被版务人员停止了本版的post权利!",
                -6 => "两次发文间隔过密,请休息几秒再试!",
                -7 => "无法读取索引文件! 请通知站务人员, 谢谢! ",
                -8 => "本文不可回复!",
                -9 => "系统内部错误, 请迅速通知站务人员, 谢谢!",
        );

        // Loose comparison, first match wins -- same semantics as a switch.
        foreach ($failures as $code => $message) {
                if ($status == $code) {
                        html_error_quit($message);
                        break;
                }
        }
}

/**
 * Reduce an HTML document to text while keeping opening <a ...> tags.
 *
 * Removes <script> sections and every tag whose first character after "<"
 * is not "a"/"A", collapses a line break followed by white space into that
 * line break, and converts a handful of named and all decimal numeric
 * entities to single bytes via chr().
 *
 * The whole substitution list is applied twice, as in the original code.
 * NOTE(review): because of the second pass, text that was entity-encoded
 * markup (e.g. "&lt;b&gt;") is decoded in pass one and stripped as a tag in
 * pass two.
 *
 * @param string $document HTML source.
 * @return string Text with the substitutions above applied.
 */
function html2txt($document)
{
        $search = array ("'<script[^>]*?>.*?</script>'si",  // Strip out javascript
                         "'<[^a][\/\!]*?[^<>]*?>'si",       // Strip out html tags, except those starting with "<a"
                         "'([\r\n])[\s]+'",                 // Strip out white space
                         "'&(quot|#34);'i",                 // Replace html entities
                         "'&(amp|#38);'i",
                         "'&(lt|#60);'i",
                         "'&(gt|#62);'i",
                         "'&(nbsp|#160);'i",
                         "'&(iexcl|#161);'i",
                         "'&(cent|#162);'i",
                         "'&(pound|#163);'i",
                         "'&(copy|#169);'i");

        $replace = array ("",
                          "",
                          "\\1",
                          "\"",
                          "&",
                          "<",
                          ">",
                          " ",
                          chr(161),
                          chr(162),
                          chr(163),
                          chr(169));

        $text = $document;
        for ($pass = 0; $pass < 2; $pass++) {
                $text = preg_replace($search, $replace, $text);
                // Remaining decimal entities. A callback replaces the former /e
                // modifier (removed in PHP 7); the explicit int cast also keeps
                // zero-padded values such as "&#065;" decimal instead of letting
                // them be evaluated as octal PHP literals.
                $text = preg_replace_callback(
                        "'&#(\d+);'",
                        function ($m) {
                                return chr((int) $m[1]);
                        },
                        $text
                );
        }
        return $text;
}

/**
 * Reduce an HTML document to plain text, removing ALL tags.
 *
 * Same substitutions as html2txt(), except that the tag pattern here has no
 * exception for "<a": <script> sections and every tag are removed, a line
 * break followed by white space is collapsed into that line break, and a
 * handful of named and all decimal numeric entities are converted to single
 * bytes via chr().
 *
 * The whole substitution list is applied twice, as in the original code.
 * NOTE(review): because of the second pass, entity-encoded angle brackets are
 * decoded in pass one and anything between a decoded "<" and ">" is stripped
 * as a tag in pass two.
 *
 * @param string $document HTML source.
 * @return string Text with the substitutions above applied.
 */
function html2txt2($document)
{
        $search = array ("'<script[^>]*?>.*?</script>'si",  // Strip out javascript
                         "'<[\/\!]*?[^<>]*?>'si",           // Strip out every html tag
                         "'([\r\n])[\s]+'",                 // Strip out white space
                         "'&(quot|#34);'i",                 // Replace html entities
                         "'&(amp|#38);'i",
                         "'&(lt|#60);'i",
                         "'&(gt|#62);'i",
                         "'&(nbsp|#160);'i",
                         "'&(iexcl|#161);'i",
                         "'&(cent|#162);'i",
                         "'&(pound|#163);'i",
                         "'&(copy|#169);'i");

        $replace = array ("",
                          "",
                          "\\1",
                          "\"",
                          "&",
                          "<",
                          ">",
                          " ",
                          chr(161),
                          chr(162),
                          chr(163),
                          chr(169));

        $text = $document;
        for ($pass = 0; $pass < 2; $pass++) {
                $text = preg_replace($search, $replace, $text);
                // Remaining decimal entities. A callback replaces the former /e
                // modifier (removed in PHP 7); the explicit int cast also keeps
                // zero-padded values such as "&#065;" decimal instead of letting
                // them be evaluated as octal PHP literals.
                $text = preg_replace_callback(
                        "'&#(\d+);'",
                        function ($m) {
                                return chr((int) $m[1]);
                        },
                        $text
                );
        }
        return $text;
}
?>
-----------------------------------------------------------------

【在 zbs168 (Skytiger) 的大作中提到: 】
: 屡调屡败
: 大家找个牛人一起来研究一下这个代码
: 需要改的地方不少
: ...................
--
FROM 202.120.110.*
8楼|nfeng|2005-03-08 23:29:48|展开
不错，是每次点击的时候去取新闻列表还是系统定时去取？

【在 chutium (阿新) 的大作中提到: 】
: http://www3.zzu.edu.cn/nownews.htm
: 这个玩意怎么样？
: http://www3.zzu.edu.cn/goodnews/zzjnews2004.dll/thenews?id=BA07C07114674AD2A0771E72A437C75C&tb=n_20050308
: ...................
--
FROM 202.120.110.*