先說 feed 在這裡,但不保證每天更新。(原因下述)
對於有訂閱四大報社論版 Feed 的使用者來說,相信都有很糟糕的經驗:
- 蘋果的 feed 看不到作者和完整文章
- 中時看不到作者和完整文章,再加上內嵌旺報的文章實在是一個非常低級的作法
- 聯合連摘要都沒有,非常差勁
- 自由除了沒有摘要之外,該版編輯改標題也改到走火入魔,標題殺人,很容易讓人錯過好文章
以下這支程式由 PHP 寫成,利用了幾個現成的類別庫(SimplePie、Simple HTML DOM、Snoopy,以及 Universal Feed Generator 的 FeedWriter):
CODE:
<?php
// Aggregates the op-ed/editorial feeds of four Taiwan newspapers into a single
// Atom feed. For every feed item the linked article page is downloaded, the
// article body (plus author/time where the page exposes them) is extracted,
// and the cleaned-up HTML becomes the item description. The finished feed is
// base64-encoded and POSTed to /puttorss.php on localhost.
set_time_limit(600); // one HTTP fetch per article, so a run can take minutes

include_once 'simplepie.class.php';
include_once 'simple_html_dom.class.php';
include_once 'snoopy.class.php';
include_once 'feedwriter.class.php';

$chinatimes = 'http://rss.chinatimes.com/rss/comment-u.rss';
$libertytimes = 'http://www.libertytimes.com.tw/rss/o.xml';
$udn = 'http://udn.com/udnrss/opinion.xml';
$apple = 'http://tw.nextmedia.com/rss/create/type/col';
$feedurl = array($apple, $chinatimes, $libertytimes, $udn);

$TestFeed = new FeedWriter(ATOM);
$TestFeed->setTitle('Taiwan Newspaper Op-ed and Editorial');
$TestFeed->setLink('http://localhost/twopedfeed.rss');
$TestFeed->setDescription('Taiwan Newspaper Op-ed aggregated from China Times, United Daily News, Liberty Times, Apple Daily');

// Tags that survive strip_tags() in the item description.
$allowedtags = "<h1><h2><h3><h4><a><p><img><br><ul><li>";

foreach ($feedurl as $url) {
    $feed = new SimplePie();
    $feed->set_feed_url($url);
    $feed->init();
    $feed->handle_content_type();

    $channeltitle = twoped_to_utf8($feed->get_title());
    $max = $feed->get_item_quantity();

    for ($x = 0; $x < $max; $x++) {
        $item = $feed->get_item($x);
        $link = $item->get_link();

        // Download the full article page. An item whose page cannot be
        // fetched is skipped instead of aborting the whole run later on.
        $snoopy = new Snoopy;
        if (!$snoopy->fetch($link) || !is_string($snoopy->results) || $snoopy->results === '') {
            continue;
        }
        $content = $snoopy->results;

        // Normalise the page to UTF-8. mb_detect_encoding() returns false
        // when the bytes match neither candidate; leave the page untouched
        // in that case rather than feeding false to mb_convert_encoding().
        $detectencoding = mb_detect_encoding($content, array('big5', 'utf-8'), true);
        if ($detectencoding !== false) {
            $content = mb_convert_encoding($content, "utf-8", $detectencoding);
            // Rewrites occurrences of the detected encoding name in the page.
            // NOTE(review): presumably meant to fix the charset declaration,
            // but the match is case-sensitive against the name mbstring
            // reports - confirm it ever matches what the sites declare.
            $content = str_replace($detectencoding, "utf-8", $content);
        }

        // Apple Daily: feed lacks the full story.
        if (strpos($link, "nextmedia") !== false) {
            $html = str_get_html($content);
            if (!$html) {
                continue; // unparsable page: skip this item
            }
            $target = $html->find("div[id=article_content]", 0);
            $target_time = $html->find("span[id=info]", 0);
            $content = twoped_node_html($target_time, false) . twoped_node_html($target, true);
            $html->clear(); // release the DOM tree; it is rebuilt for every article
            unset($html);
            // NOTE(review): '<' and '>' each appear twice here; one pair was
            // possibly an HTML-entity form before this listing was published.
            $channeltitle = str_replace(array('<','>','<','>',"日報論壇總覽"), array('','','','',""), $channeltitle);
        }

        // UDN: feed has no summary at all.
        if (strpos($link, "udn") !== false) {
            $html = str_get_html($content);
            if (!$html) {
                continue;
            }
            $target = $html->find("div[id=story]", 0);
            $target_author = $html->find("div[id=story_author]", 0);
            $target_time = $html->find("div[id=story_update]", 0);
            $content = twoped_node_html($target_author, false)
                . twoped_node_html($target_time, false)
                . twoped_node_html($target, true);
            $html->clear();
            unset($html);
            $channeltitle = str_replace(array('udn意見評論'), array('聯合'), $channeltitle);
        }

        // Liberty Times: feed has no summary.
        if (strpos($link, "libertytimes") !== false) {
            $html = str_get_html($content);
            if (!$html) {
                continue;
            }
            $target = $html->find("span[id=newcontent]", 0);
            $target_time = $html->find("td[id=date]", 0);
            $content = twoped_node_html($target_time, false) . twoped_node_html($target, true);
            $html->clear();
            unset($html);
            $channeltitle = str_replace(array('自由電子報--自由廣場'), array('自由'), $channeltitle);
        }

        // China Times: drop articles that link to want-daily.com, complete the rest.
        if (strpos($link, "chinatimes") !== false) {
            if (strpos($content, "href=\"http://www.want-daily.com\"") !== false) {
                continue;
            }
            $html = str_get_html($content);
            if (!$html) {
                continue;
            }
            $target = $html->find("div[id=ctkeywordcontent]", 0);
            // Time and author live in a list inside the article box.
            $tempdiv = $html->find('div[class=articlebox]', 0);
            $articlebox = is_object($tempdiv) ? $tempdiv->find('ul[class=inline-list]', 0) : null;
            // NOTE(review): this strips every plain space, including the ones
            // between a tag name and its attributes - verify the search string
            // was not originally an entity or a full-width space.
            $content = twoped_node_html($articlebox, true) . str_replace(" ", '', twoped_node_html($target, true));
            $html->clear();
            unset($html);
            $channeltitle = str_replace(array('中時電子報-言論新聞'), array('中時'), $channeltitle);
        }

        $title = twoped_to_utf8($item->get_title());
        $itemlink = twoped_to_utf8($link);
        $body = twoped_to_utf8(HtmlFix(strip_tags($content, $allowedtags)));

        // get_date() is converted to a Unix timestamp unless it already is
        // one; an unparsable date ends up as timestamp 0.
        $date = $item->get_date();
        $timestamp = is_int($date) ? $date : strtotime((string) $date);
        $itemdate = date('r', (int) $timestamp);

        $newfeeditem = $TestFeed->createNewItem();
        $newfeeditem->setTitle('[' . $channeltitle . '] ' . $title);
        $newfeeditem->setLink($itemlink);
        $newfeeditem->setDate($itemdate);
        $newfeeditem->setDescription($body);
        $TestFeed->addItem($newfeeditem);
    }
}

$output = $TestFeed->returnFeed();
echo http_request('POST', 'localhost', 80, '/puttorss.php', array(), array('content' => base64_encode($output)));

/**
 * Converts text that is either Big5 or UTF-8 to UTF-8.
 *
 * Returns the input unchanged when it is empty or when neither encoding
 * matches, instead of passing a failed detection on to mb_convert_encoding().
 */
function twoped_to_utf8($text)
{
    $text = (string) $text;
    if ($text === '') {
        return '';
    }
    $encoding = mb_detect_encoding($text, array('big5', 'utf-8'), true);
    if ($encoding === false) {
        return $text;
    }
    return mb_convert_encoding($text, "utf-8", $encoding);
}

/**
 * Returns the outer ($outer = true) or inner HTML of a simple_html_dom node,
 * or '' when the selector matched nothing (find() handed back a non-object).
 */
function twoped_node_html($node, $outer)
{
    if (!is_object($node)) {
        return '';
    }
    return $outer ? $node->outertext : $node->innertext;
}
/**
 * Sends a single HTTP/1.1 request over a raw TCP socket and returns the reply.
 *
 * @param string $verb           HTTP request method (GET and POST supported)
 * @param string $ip             Target IP/hostname (also sent as the Host header)
 * @param int    $port           Target TCP port
 * @param string $uri            Target URI
 * @param array  $getdata        Query-string data, e.g. array('var1' => 'val1')
 * @param array  $postdata       POST form data, e.g. array('var1' => 'val1')
 * @param array  $cookie         Cookie data, e.g. array('var1' => 'val1')
 * @param array  $custom_headers Extra headers as array('Name' => 'value')
 * @param int    $timeout        Socket read timeout in milliseconds
 * @param bool   $req_hdr        Prepend the request that was sent to the result
 * @param bool   $res_hdr        Keep the response headers in the result
 *
 * @return string The response (body only unless $res_hdr is true), optionally
 *                prefixed with the request; "Error <errno>: <message>\n" when
 *                the connection cannot be opened.
 */
function http_request(
    $verb = 'GET',
    $ip,
    $port = 80,
    $uri = '/',
    $getdata = array(),
    $postdata = array(),
    $cookie = array(),
    $custom_headers = array(),
    $timeout = 1000,
    $req_hdr = false,
    $res_hdr = false
)
{
    $verb = strtoupper($verb);
    $crlf = "\r\n";

    // Query string: pairs must be joined with '&', otherwise two or more
    // GET parameters run together into one malformed value.
    $pairs = array();
    foreach ($getdata as $k => $v) {
        $pairs[] = urlencode($k) . '=' . urlencode($v);
    }
    $getdata_str = count($pairs) ? '?' . implode('&', $pairs) : '';

    $pairs = array();
    foreach ($postdata as $k => $v) {
        $pairs[] = urlencode($k) . '=' . urlencode($v);
    }
    $postdata_str = implode('&', $pairs);

    $pairs = array();
    foreach ($cookie as $k => $v) {
        $pairs[] = urlencode($k) . '=' . urlencode($v);
    }
    $cookie_str = implode('; ', $pairs);

    $req = $verb . ' ' . $uri . $getdata_str . ' HTTP/1.1' . $crlf;
    $req .= 'Host: ' . $ip . $crlf;
    $req .= 'User-Agent: Mozilla/5.0 Firefox/3.6.12' . $crlf;
    $req .= 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' . $crlf;
    $req .= 'Accept-Language: en-us,en;q=0.5' . $crlf;
    $req .= 'Accept-Encoding: deflate' . $crlf;
    $req .= 'Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7' . $crlf;

    $has_connection_header = false;
    foreach ($custom_headers as $k => $v) {
        $req .= $k . ': ' . $v . $crlf;
        if (strcasecmp((string) $k, 'Connection') === 0) {
            $has_connection_header = true;
        }
    }
    // HTTP/1.1 connections are persistent by default; without this the
    // server keeps the socket open and the read loop below only ends when
    // the timeout fires.
    if (!$has_connection_header) {
        $req .= 'Connection: close' . $crlf;
    }

    if ($cookie_str !== '') {
        $req .= 'Cookie: ' . $cookie_str . $crlf;
    }

    if ($verb == 'POST' && $postdata_str !== '') {
        $req .= 'Content-Type: application/x-www-form-urlencoded' . $crlf;
        $req .= 'Content-Length: ' . strlen($postdata_str) . $crlf . $crlf;
        $req .= $postdata_str;
    } else {
        $req .= $crlf;
    }

    // Connection failures are reported through the return value, so the
    // PHP warning is suppressed on purpose.
    $fp = @fsockopen($ip, $port, $errno, $errstr);
    if ($fp === false) {
        return "Error $errno: $errstr\n";
    }

    $timeout = (int) $timeout;
    stream_set_timeout($fp, (int) floor($timeout / 1000), ($timeout % 1000) * 1000);
    fwrite($fp, $req);

    // Compare against false explicitly: a final line consisting of "0"
    // is falsy and would otherwise end the loop and be lost.
    $response = '';
    while (($line = fgets($fp)) !== false) {
        $response .= $line;
    }
    fclose($fp);

    if (!$res_hdr) {
        // Strip headers from the response only, never from the echoed
        // request. With no blank line everything received is header data.
        $separator = strpos($response, "\r\n\r\n");
        $response = ($separator === false) ? '' : (string) substr($response, $separator + 4);
    }

    return ($req_hdr ? $req : '') . $response;
}
/**
 * Repairs an HTML fragment with the tidy extension and returns the markup of
 * the top-level nodes inside the resulting <body>.
 *
 * When tidy is not installed the input is returned unchanged; when the
 * repaired document has no body children an empty string is returned.
 */
function HtmlFix($html) {
    // Nothing to repair with: hand the fragment back as it came in.
    if (!function_exists('tidy_repair_string')) {
        return $html;
    }

    $tidyOptions = array('output-xhtml' => true);

    // First pass fixes the markup, second pass builds a node tree from it.
    $repaired = tidy_repair_string($html, $tidyOptions, 'utf8');
    $document = tidy_parse_string($repaired, $tidyOptions, 'utf8');

    $body = @tidy_get_body($document);
    $children = is_object($body) ? $body->child : null;
    if (!is_array($children)) {
        return '';
    }

    // Concatenate the serialized form of each direct child of <body>.
    $fragments = array();
    foreach ($children as $node) {
        $fragments[] = $node->value;
    }
    return implode('', $fragments);
}
?>
使用這段程式碼時必須先修改 Universal Feed Generator 這支程式:將 feedwriter 第 289 行修改為 $nodeText .= (in_array($tagName, $this->CDATAEncoding))? $tagContent : htmlentities($tagContent, ENT_COMPAT, 'UTF-8'); ,才能正確處理 utf-8 資料。此外,feedwriter 本身的函數都直接以 echo 輸出結果,也必須改為以 return 回傳(例如上面程式用到的 returnFeed()),才能把整份 feed 存進 $output。
$output 就是整個 RSS 的輸出,這部分可以依自己的情形調整。以上程式包含一些我自己的作法,因為我沒有自己的主機,所以我決定每次開機的時候跑一次這個程式,然後將 output 資料利用 POST 的方式送到免費網頁主機(我用ihost)寫入某個靜態 RSS 檔以節省流量。有需要的也可以直接訂閱這個 RSS 檔,但不保證每天更新。
0 comment(s):
張貼留言