<?php
// $Id: rssc_xml_utility.php,v 1.12 2006/12/02 17:42:12 ohwada Exp $

// 2006-12-02 K.OHWADA
// BUG 4389: cannot auto discovery RDF url

// 2006-11-08 K.OHWADA
// add set_proxy()

// 2006-09-20 K.OHWADA
// move discover_for_manage() from rssc_link_exist_handler
// use set_error_code()

// 2006-07-10 K.OHWADA
// use happy_linux_error happy_linux_remote_file etc

// 2006-06-04 K.OHWADA
// add DEFAULT_ENCODINGS
// move get_unixtime_rfc822(), get_unixtime_w3cdtf() from parse_base
// move parse_by_url() to parse_handler
// suppress notice : Only variable references should be returned by reference

//=========================================================
// Rss Center Module
// 2006-01-01 K.OHWADA
//=========================================================

// === class begin ===
if( !class_exists('rssc_xml_utility') ) 
{

//=========================================================
// class rssc_xml_utility
//=========================================================
class rssc_xml_utility extends happy_linux_error
{
// class instance
// collaborators; all three are assigned in the constructor from the
// getInstance() of the happy_linux_* class named on each line
//	var $_xml_parser;
	var $_remote;     // happy_linux_remote_file: fetches remote documents
	var $_convert;    // happy_linux_convert_encoding: character set conversion
	var $_strings;    // happy_linux_strings: strip_control() / strip_tab()

// basic config
// feed format preferred by discover() when the caller passes no $sel
	var $_sel_priority = RSSC_C_SEL_ATOM;

// result
// state left behind by the most recent read / discover / find call
	var $_html_data;               // set by read_html()
	var $_xml_data;                // set by read_xml()
	var $_xml_mode;                // RSSC_C_MODE_* value set by discover(), discover_for_manage(), find_kind()
	var $_rdf_url;
	var $_rss_url;
	var $_atom_url;
	var $_xml_kind;                // 'rdf', 'rss' or 'atom', set by find_kind()
	var $_xml_encoding_detected;   // encoding name exactly as written in the XML declaration
	var $_result_code = 0;         // set to RSSC_CODE_XML_ENCODINGS_DEFAULT by find_encoding()

// encodings passed to the parser without conversion (see _convert_xml_to_parse)
	var $_KNOWN_ENCODINGS    = array('utf-8', 'us-ascii', 'iso-8859-1');
// encoding assumed by find_encoding() when the document declares none
	var $_DEFAULT_ENCODINGS  = 'utf-8';
// target encoding when _convert_xml_to_parse() has to convert
	var $_SOURCE_ENCODINGS   = 'utf-8';

//---------------------------------------------------------
// constructor
//---------------------------------------------------------
// Constructor.
// Runs the happy_linux_error initialiser and fetches the shared helper
// objects used for remote reads, encoding conversion and string cleanup.
// Declared as __construct() because PHP 8 no longer treats a method named
// after the class as a constructor; without it "new rssc_xml_utility()"
// would leave $_remote, $_convert and $_strings unset.
function __construct()
{
	$this->happy_linux_error();

// class instance
	$this->_remote  =& happy_linux_remote_file::getInstance();
	$this->_convert =& happy_linux_convert_encoding::getInstance();
	$this->_strings =& happy_linux_strings::getInstance();
}

// PHP 4 style constructor.
// Kept so that PHP 4 and any subclass calling $this->rssc_xml_utility()
// keep working; it only forwards to __construct().
function rssc_xml_utility()
{
	$this->__construct();
}

// Return the single shared rssc_xml_utility object.
// The object is created on the first call and kept in a function-level
// static, so every later call hands back a reference to the same one.
function &getInstance()
{
	static $singleton = null;

	if ( is_null($singleton) )
	{
		$singleton = new rssc_xml_utility();
	}

	return $singleton;
}

//---------------------------------------------------------
// init
//---------------------------------------------------------
// Pass proxy settings through to the remote file reader.
// All four arguments are handed unchanged to set_snoopy_proxy() on
// $this->_remote; $port defaults to '8080', $user and $pass to ''.
function set_proxy( $host, $port='8080', $user='', $pass='' )
{
	$this->_remote->set_snoopy_proxy( $host, $port, $user, $pass );
}

//=========================================================
// public
//=========================================================
//---------------------------------------------------------
// discover XML link
//---------------------------------------------------------
// Fetch an HTML page and pick the feed it advertises.
//   $html_url : address of the HTML page
//   $sel      : preferred format (RSSC_C_SEL_*); when empty the value of
//               $this->_sel_priority is used
// Returns true when at least one feed link was found, false when the page
// could not be read or advertises no feed.
// On success the chosen mode and all three urls are stored on the object
// (get_xml_mode(), get_rdf_url(), get_rss_url(), get_atom_url()).
function discover($html_url, $sel='')
{
	$this->_set_log_func_name('discover');

	if ( empty($sel) )
	{
		$sel = $this->_sel_priority;
	}

	$page = $this->read_html($html_url);
	if ( empty($page) )
	{
		return false;
	}

	list($rdf_url, $rss_url, $atom_url) = $this->find_link($page, $html_url);

// the page advertises no feed at all
	if ( !$atom_url && !$rss_url && !$rdf_url )
	{
		return false;
	}

// first choice: the preferred format, when the page offers it
	if ( ($sel == RSSC_C_SEL_ATOM) && $atom_url )
	{
		$xml_mode = RSSC_C_MODE_ATOM;
	}
	elseif ( ($sel == RSSC_C_SEL_RSS) && $rss_url )
	{
		$xml_mode = RSSC_C_MODE_RSS;
	}
	elseif ( ($sel == RSSC_C_SEL_RDF) && $rdf_url )
	{
		$xml_mode = RSSC_C_MODE_RDF;
	}
// otherwise: whatever is available, in the order atom, rss, rdf
	else
	{
		$xml_mode = $atom_url
			? RSSC_C_MODE_ATOM
			: ( $rss_url ? RSSC_C_MODE_RSS : RSSC_C_MODE_RDF );
	}

	$this->_xml_mode = $xml_mode;
	$this->_rdf_url  = $rdf_url;
	$this->_rss_url  = $rss_url;
	$this->_atom_url = $atom_url;

	return true;
}

//---------------------------------------------------------
// discover_for_manage (old label: check_exist_rssurl)
// for admin/link_manage.php
//---------------------------------------------------------
// Resolve the feed mode and urls for one link.
//   $mode     : RSSC_C_MODE_* value; auto discovery runs only for
//               RSSC_C_MODE_AUTO
//   $url      : address of the HTML page to inspect
//   $rdf_url, $rss_url, $atom_url : values supplied by the caller; each is
//               kept unless discovery finds a replacement for it
//   $sel      : preferred format, passed on to discover()
// Returns 0 when no discovery was attempted, otherwise
// RSSC_CODE_DISCOVER_SUCCEEDED or RSSC_CODE_DISCOVER_FAILED.
// The resulting mode and urls are stored on the object in every case.
function discover_for_manage( $mode, $url, $rdf_url, $rss_url, $atom_url, $sel )
{
	$ret_code = 0;

// RSS auto discovery
	if ( $mode == RSSC_C_MODE_AUTO )
	{
		if ( $this->discover($url, $sel) )
		{
			$ret_code = RSSC_CODE_DISCOVER_SUCCEEDED;
			$mode     = $this->get_xml_mode();

// a discovered url wins over the one passed in; formats the page does
// not advertise keep the caller's value
			$auto_rdf_url  = $this->get_rdf_url();
			$auto_rss_url  = $this->get_rss_url();
			$auto_atom_url = $this->get_atom_url();

			if ( $auto_rdf_url )
			{
				$rdf_url = $auto_rdf_url;
			}

			if ( $auto_rss_url )
			{
				$rss_url = $auto_rss_url;
			}

			if ( $auto_atom_url )
			{
				$atom_url = $auto_atom_url;
			}
		}
		else
		{
// cannot discover xml link
// discover() runs on this same object, so its errors are already
// recorded here; copying getErrors() back in would only repeat them
			$ret_code = RSSC_CODE_DISCOVER_FAILED;
			$this->_set_errors( "cannot discover xml link" );
		}
	}

	$this->_xml_mode = $mode;
	$this->_rdf_url  = $rdf_url;
	$this->_rss_url  = $rss_url;
	$this->_atom_url = $atom_url;

	return $ret_code;
}

//---------------------------------------------------------
// read remote HTML
//---------------------------------------------------------
// Fetch the HTML document at $url.
// Returns the document body and keeps a copy for get_html().
// When the reader returns nothing, its error code and messages are copied
// onto this object and false is returned.
function read_html($url)
{
	$this->_set_log_func_name('read_html');

// read remote HTML
	$body = $this->_remote->read_file($url);

	if ( $body )
	{
		$this->_html_data = $body;
		return $body;
	}

	$this->_set_error_code( $this->_remote->getErrorCode() );
	$this->_set_errors(     $this->_remote->getErrors() );
	return false;
}

//---------------------------------------------------------
// read remote XML
//---------------------------------------------------------
// Fetch the XML document at $url.
// Returns the document body and keeps a copy for get_xml().
// When the reader returns nothing, its error code and messages are copied
// onto this object and false is returned.
function read_xml($url)
{
	$this->_set_log_func_name('read_xml');

// read remote XML
	$body = $this->_remote->read_file($url);

	if ( $body )
	{
		$this->_xml_data = $body;
		return $body;
	}

	$this->_set_error_code( $this->_remote->getErrorCode() );
	$this->_set_errors(     $this->_remote->getErrors() );
	return false;
}

//---------------------------------------------------------
// find XML link: auto discovery
//---------------------------------------------------------
// Look for feed links in an HTML document (auto discovery).
//   $html_data : HTML source
//   $html_url  : address the HTML came from; passed on to _find_xml_link()
// Returns array(rdf_url, rss_url, atom_url).
// An error message is recorded when all three are empty.
function find_link($html_data, $html_url='')
{
	$this->_set_log_func_name('find_link');

	$links = $this->_find_xml_link($html_data, $html_url);
	list($rdf_url, $rss_url, $atom_url) = $links;

	$found_any = !empty($rdf_url) || !empty($rss_url) || !empty($atom_url);
	if ( !$found_any )
	{
		$this->_set_errors( "cannot find xml link: url = $html_url" );
	}

	return array($rdf_url, $rss_url, $atom_url);
}

//---------------------------------------------------------
// find XML encoding
//---------------------------------------------------------
// Determine the character encoding of an XML document.
// Returns the encoding name in lower case.
// When the document declares none, $this->_DEFAULT_ENCODINGS is returned
// and get_result_code() reports RSSC_CODE_XML_ENCODINGS_DEFAULT.
// Returns false, with an error recorded, only when there is neither a
// declaration nor a default.
function find_encoding($xml)
{
	$this->_set_log_func_name('find_encoding');

// this object is shared (see getInstance), so clear the flag a previous
// document may have left behind before deciding for this one
	$this->_result_code = 0;

	$encoding = $this->_find_xml_encoding($xml);
	if ( $encoding )
	{
		$this->_xml_encoding_detected = $encoding;
	}
	elseif ( $this->_DEFAULT_ENCODINGS )
	{
		$encoding = $this->_DEFAULT_ENCODINGS;
		$this->_result_code = RSSC_CODE_XML_ENCODINGS_DEFAULT;
	}
	else
	{
		$this->_set_errors( "cannot find xml encoding" );
		return false;
	}

	return strtolower( $encoding );
}


//---------------------------------------------------------
// find XML mode
//---------------------------------------------------------
// Work out which feed format an XML document uses.
// Returns RSSC_C_MODE_RDF, RSSC_C_MODE_RSS or RSSC_C_MODE_ATOM and stores
// both the kind name and the mode on the object.
// Returns false, with an error recorded, when the format is not recognised.
function find_kind($xml)
{
	$this->_set_log_func_name('find_kind');

	$kind = $this->_find_xml_kind($xml);
	if ( !$kind )
	{
		$this->_set_errors( "cannot find xml kind" );
		return false;
	}

	if ( $kind == 'rdf' )
	{
		$mode = RSSC_C_MODE_RDF;
	}
	elseif ( $kind == 'rss' )
	{
		$mode = RSSC_C_MODE_RSS;
	}
	elseif ( $kind == 'atom' )
	{
		$mode = RSSC_C_MODE_ATOM;
	}
	else
	{
		$this->_set_errors( "cannot find xml kind" );
		return false;
	}

	$this->_xml_kind = $kind;
	$this->_xml_mode = $mode;
	return $mode;
}

//---------------------------------------------------------
// convert XML to parse
//---------------------------------------------------------
// Prepare XML text for the parser.
// Public entry point for _convert_xml_to_parse(); returns its result,
// array(xml text, encoding name), unchanged.
function convert_to_parse($xml_data, $xml_encoding)
{
	$this->_set_log_func_name('convert_to_parse');
	return $this->_convert_xml_to_parse($xml_data, $xml_encoding);
}

//---------------------------------------------------------
// set param
//---------------------------------------------------------
// Set the feed format discover() prefers when called without $sel.
// $value is compared against the RSSC_C_SEL_* constants in discover().
function set_priority($value)
{
	$this->_sel_priority = $value;
}

// Store $value in $this->_xml_encoding_local.
// NOTE(review): this property is not declared with the other class
// variables and nothing in this file reads it - confirm whether a
// subclass or caller still depends on it.
function set_encoding_local($value)
{
	$this->_xml_encoding_local = $value;
}

//---------------------------------------------------------
// get result of auto discovery
//---------------------------------------------------------
// Return the HTML stored by the last successful read_html().
function get_html()
{
	return $this->_html_data;
}

// Return the mode last stored by discover(), discover_for_manage() or
// find_kind().
function get_xml_mode()
{
	return $this->_xml_mode;
}

// Return the RDF feed url last stored by discover() or
// discover_for_manage().
function get_rdf_url()
{
	return $this->_rdf_url;
}

// Return the RSS feed url last stored by discover() or
// discover_for_manage().
function get_rss_url()
{
	return $this->_rss_url;
}

// Return the Atom feed url last stored by discover() or
// discover_for_manage().
function get_atom_url()
{
	return $this->_atom_url;
}

// Return the stored feed url that belongs to the current mode:
// the RDF, RSS or Atom url for RSSC_C_MODE_RDF / _RSS / _ATOM.
// Returns false for any other mode.
function get_xmlurl_by_mode()
{
	$mode = $this->_xml_mode;

	if ( $mode == RSSC_C_MODE_RDF )
	{
		return $this->_rdf_url;
	}

	if ( $mode == RSSC_C_MODE_RSS )
	{
		return $this->_rss_url;
	}

	if ( $mode == RSSC_C_MODE_ATOM )
	{
		return $this->_atom_url;
	}

	return false;
}

//---------------------------------------------------------
// get result of parse
//---------------------------------------------------------
// Return the XML stored by the last successful read_xml().
function get_xml()
{
	return $this->_xml_data;
}

// Return the kind name ('rdf', 'rss' or 'atom') stored by the last
// successful find_kind().
function get_xml_kind()
{
	return $this->_xml_kind;
}

// Return the result code; it starts at 0 and find_encoding() sets it to
// RSSC_CODE_XML_ENCODINGS_DEFAULT when it falls back to the default
// encoding.
function get_result_code()
{
	return $this->_result_code;
}


//=========================================================
// private
//=========================================================

//---------------------------------------------------------
// find RDF/RSS/ATOM link in HTML
//---------------------------------------------------------
// <link rel="alternate" type="application/rdf+xml"  title="RDF" href="xxx" /> 
// <link rel="alternate" type="application/rss+xml"  title="RSS" href="xxx" /> 
// <link rel="alternate" type="application/atom+xml" title="ATOM" href="xxx" /> 
//---------------------------------------------------------
// Scan HTML for <link rel="alternate" type="application/...+xml" href="...">
// tags and return the first url found for each feed format.
//   $html : HTML source
//   $url  : address of the page; when given, relative hrefs are turned
//           into full urls with _relative_to_full_url()
// Returns array(rdf_url, rss_url, atom_url); a format that is not
// advertised yields ''.
// Attributes are read with a tokenising pattern rather than by splitting
// on whitespace, so values containing spaces, spaces around "=" and
// unquoted values are all understood.
function _find_xml_link($html, $url='')
{
// first href seen for each supported media type
	$href_by_type = array(
		'application/rdf+xml'  => '',
		'application/rss+xml'  => '',
		'application/atom+xml' => ''
	);

// attribute text of every <link ...> tag
	preg_match_all('/<link\s+(.*?)\s*\/?>/si', $html, $match);

// name = "value" | name = 'value' | name = value
	$attr_pattern =
		'/([^\s=<>"\'\/]+)\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))/s';

	foreach ($match[1] as $tag_text)
	{
		$attr = array();
		preg_match_all($attr_pattern, $tag_text, $pairs, PREG_SET_ORDER);

		foreach ($pairs as $pair)
		{
// only one of the groups 2..4 takes part in a match; the highest index
// present is the one holding the value
			if ( isset($pair[4]) )
			{
				$value = $pair[4];
			}
			elseif ( isset($pair[3]) )
			{
				$value = $pair[3];
			}
			else
			{
				$value = isset($pair[2]) ? $pair[2] : '';
			}

			$attr[ strtolower($pair[1]) ] = $value;
		}

		if ( !isset($attr['rel']) )   continue;
		if ( !isset($attr['type']) )  continue;
		if ( !isset($attr['href']) )  continue;

// rel is a space separated list of tokens
		$rel_tokens = preg_split('/\s+/', strtolower( trim($attr['rel']) ));
		if ( !in_array('alternate', $rel_tokens) )  continue;

// drop media type parameters such as "; charset=utf-8"
		$type = strtolower( $attr['type'] );
		$semicolon = strpos($type, ';');
		if ( $semicolon !== false )
		{
			$type = substr($type, 0, $semicolon);
		}
		$type = trim($type);

// BUG 4389: cannot auto discovery RDF url
// each format is judged on its own, the first link of a type wins
		if ( isset($href_by_type[$type]) && empty($href_by_type[$type]) )
		{
// "&amp;" inside an HTML attribute stands for a plain "&"
			$href_by_type[$type] = str_replace('&amp;', '&', trim($attr['href']));
		}
	}

	$href_rdf  = $href_by_type['application/rdf+xml'];
	$href_rss  = $href_by_type['application/rss+xml'];
	$href_atom = $href_by_type['application/atom+xml'];

	if ($url)
	{
		$href_rdf  = $this->_relative_to_full_url($href_rdf,  $url);
		$href_rss  = $this->_relative_to_full_url($href_rss,  $url);
		$href_atom = $this->_relative_to_full_url($href_atom, $url);
	}

	return array($href_rdf, $href_rss, $href_atom);
}

//---------------------------------------------------------
// relative_to_full_url
//---------------------------------------------------------
// Turn a link found in a page into a full url.
//   $url      : link as written in the page (absolute, "//host/...",
//               "/root/relative", "?query" or "path/relative")
//   $url_html : address of the page the link was found in
// Returns '' for an empty $url. A $url that already carries a scheme is
// returned unchanged, and so is any $url when $url_html has no host to
// resolve against.
// The scheme, credentials, host and port of $url_html are kept, so pages
// served over https resolve to https links.
// "./" and "../" segments are not collapsed; they are left in the result.
function _relative_to_full_url($url, $url_html)
{
	if ( empty($url) )  return '';

// already a full url: starts with "scheme:"
	if ( preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url) )
	{
		return $url;
	}

	$base = parse_url($url_html);
	if ( !is_array($base) || empty($base['host']) )
	{
		return $url;
	}

	$scheme = empty($base['scheme']) ? 'http' : $base['scheme'];

// "scheme://user:pass@host:port"
	$origin = $scheme . '://';
	if ( isset($base['user']) )
	{
		$origin .= $base['user'];
		if ( isset($base['pass']) )
		{
			$origin .= ':' . $base['pass'];
		}
		$origin .= '@';
	}
	$origin .= $base['host'];
	if ( isset($base['port']) )
	{
		$origin .= ':' . $base['port'];
	}

	$path = empty($base['path']) ? '/' : $base['path'];

// start from "//": same scheme, other host
	if ( substr($url, 0, 2) == '//' )
	{
		return $scheme . ':' . $url;
	}

// start from "/": relative to the site root
	if ( substr($url, 0, 1) == '/' )
	{
		return $origin . $url;
	}

// start from "?": same document, other query
	if ( substr($url, 0, 1) == '?' )
	{
		return $origin . $path . $url;
	}

// anything else is relative to the directory of the page,
// that is the page path up to and including its last "/"
	$slash = strrpos($path, '/');
	$dir   = ($slash === false) ? '/' : substr($path, 0, $slash + 1);

	return $origin . $dir . $url;
}

//---------------------------------------------------------
// find XML encoding
// < ? xml version="1.0" encoding="UTF-8" ? >
//---------------------------------------------------------
// Read the encoding named in the XML declaration of $text.
// Accepts both quoting styles allowed by XML, encoding="..." and
// encoding='...', with optional white space around "=".
// Returns the name exactly as written, or false when $text has no XML
// declaration or the declaration names no encoding.
function _find_xml_encoding($text)
{
// "\s" after "xml" keeps <?xml-stylesheet ...?> from being taken for
// the declaration
	if ( !preg_match('/<\?xml\s(.*?)\?>/si', $text, $declaration) )
	{
		return false;
	}

	if ( preg_match('/\bencoding\s*=\s*(["\'])(.*?)\1/si', $declaration[1], $attr) )
	{
		return $attr[2];
	}

	return false;
}

//---------------------------------------------------------
// find XML mode
// < rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" >
// < rss version="2.0" >
// < feed version="0.3" xmlns="http://purl.org/atom/ns#" >
// < feed xmlns="http://www.w3.org/2005/Atom" >
//---------------------------------------------------------
// Name the feed format of $text from the first root-like tag found.
// Checked in this order: <rdf:RDF ...> gives 'rdf', <rss ...> gives 'rss',
// and a <feed ...> tag whose attributes mention "atom" gives 'atom'.
// Returns false when none of them matches.
function _find_xml_kind($text)
{
	$root_patterns = array(
		'rdf' => '/<rdf:RDF(.*?)>/si',
		'rss' => '/<rss(.*?)>/si'
	);

	foreach ($root_patterns as $kind => $pattern)
	{
		if ( preg_match($pattern, $text) )
		{
			return $kind;
		}
	}

// a <feed> tag counts only when its attributes name atom
	if ( preg_match('/<feed(.*?)>/si', $text, $feed_tag)
		&& preg_match('/atom/si', $feed_tag[1]) )
	{
		return 'atom';
	}

	return false;
}

//---------------------------------------------------------
// convert xml to parse
//---------------------------------------------------------
// Make XML text acceptable to the parser.
//   $xml      : XML text
//   $encoding : encoding of $xml
// Returns array(xml text, encoding of the returned text):
//   - encoding listed in $_KNOWN_ENCODINGS: text is cleaned only
//   - any other non-empty encoding: text is converted to
//     $_SOURCE_ENCODINGS, then cleaned
//   - empty encoding: text is returned untouched
function _convert_xml_to_parse($xml, $encoding)
{
// not convert, if PHP default
	if ( $this->_check_known_encoding($encoding) )
	{
		return array( $this->_cleanup_xml($xml), $encoding );
	}

// no action
	if ( !$encoding )
	{
		return array($xml, $encoding);
	}

// convert
	$target    = $this->_SOURCE_ENCODINGS;
	$converted = $this->_convert->convert($xml, $target, $encoding);

	return array( $this->_cleanup_xml($converted), $target );
}

// Run $text through strip_control() and then strip_tab() of the strings
// helper and return the result.
function _cleanup_xml($text)
{
	$without_control = $this->_strings->strip_control($text);
	return $this->_strings->strip_tab($without_control);
}

// Tell whether $enc, compared in lower case, is listed in
// $this->_KNOWN_ENCODINGS. Returns true or false.
function _check_known_encoding($enc)
{
	return in_array( strtolower($enc), $this->_KNOWN_ENCODINGS );
}

//--------------------------------------------------------
// get unixtime from RFC822
//--------------------------------------------------------
// Convert an RFC 822 date string to a unix timestamp.
// When the string cannot be parsed, a trailing alphabetic time zone name
// is removed and parsing is tried once more.
// Returns an integer timestamp, or 0 when the date cannot be parsed or
// lies before 1970.
function get_unixtime_rfc822( $datetime )
{
	$unixtime = strtotime($datetime);

// maybe undefined time zone
// strtotime() reports failure as false (PHP 5.1 and later) or -1 (older)
	if ( ($unixtime === false) || ($unixtime == -1) )
	{

// delete time zone
		$datetime = preg_replace("/ [a-zA-Z]{3,}$/", '', $datetime);
		$unixtime = strtotime( $datetime );
	}

// give up
	$unixtime = intval($unixtime);
	if ($unixtime < 0)
	{
		$unixtime = 0;
	}

	return $unixtime;
}

// -------------------------------------------------------------------------
// http://www.arielworks.net/articles/2004/0224c/
// array parse_w3cdtf(string datetime)
// -------------------------------------------------------------------------
// http://www.w3.org/TR/NOTE-datetime
//  Year:
//      YYYY (eg 1997)
//   Year and month:
//      YYYY-MM (eg 1997-07)
//   Complete date:
//      YYYY-MM-DD (eg 1997-07-16)
//   Complete date plus hours and minutes:
//      YYYY-MM-DDThh:mmTZD (eg 1997-07-16T19:20+01:00)
//   Complete date plus hours, minutes and seconds:
//      YYYY-MM-DDThh:mm:ssTZD (eg 1997-07-16T19:20:30+01:00)
//   Complete date plus hours, minutes, seconds and a decimal fraction of a second
//      YYYY-MM-DDThh:mm:ss.sTZD (eg 1997-07-16T19:20:30.45+01:00)
// -------------------------------------------------------------------------
// Convert a W3C-DTF date string to a unix timestamp.
// Returns the 'timestamp' entry produced by parse_w3cdtf() as an integer,
// or 0 when the string is not W3C-DTF or the timestamp is negative.
function get_unixtime_w3cdtf($datetime)
{
	$parsed = $this->parse_w3cdtf($datetime);

	if ( !isset($parsed['timestamp']) )
	{
		return 0;
	}

	$unixtime = intval( $parsed['timestamp'] );

// give up
	return ($unixtime < 0) ? 0 : $unixtime;
}

// Parse a W3C-DTF date (http://www.w3.org/TR/NOTE-datetime).
// Accepted forms: YYYY, YYYY-MM, YYYY-MM-DD, and the complete date followed
// by Thh:mmTZD, Thh:mm:ssTZD or Thh:mm:ss.sTZD, where TZD is "Z" or
// +hh:mm / -hh:mm. Hours must be 00-23.
// Returns false when $datetime is not in one of these forms, otherwise an
// array with the keys year, month, day, hour, minute, second, fraction,
// timezone and timestamp. Parts present in the input are returned as the
// matched strings, missing parts as integer 0.
// 'timestamp' is the UTC unix time; for that calculation a missing month
// or day counts as 1, so "1997" means 1997-01-01T00:00:00Z.
function parse_w3cdtf($datetime)
{
// groups: 1 year, 2 month, 3 day, 4 hour, 5 minute, 6 second,
//         7 fraction, 8 time zone designator
	$pattern = '/^([0-9]{4})'
		. '(?:-(0[1-9]|1[0-2])'
		. '(?:-(0[1-9]|[12][0-9]|3[01])'
		. '(?:T([01][0-9]|2[0-3]):([0-5][0-9])'
		. '(?::([0-5][0-9])(?:[.,]([0-9]+))?)?'
		. '(Z|[+-][01][0-9]:[0-5][0-9])'
		. ')?)?)?$/';

// Not W3C-DTF
	if ( !preg_match($pattern, $datetime, $val) )
	{
		return false;
	}

// a group that took no part in the match is missing or ''; use 0 then
	$names  = array('year', 'month', 'day', 'hour', 'minute', 'second', 'fraction', 'timezone');
	$result = array();
	foreach ($names as $i => $name)
	{
		$group = $i + 1;
		$result[$name] = ( isset($val[$group]) && ($val[$group] !== '') ) ? $val[$group] : 0;
	}

// Offset of Timezone for gmmktime()
	$offset_hour   = 0;
	$offset_minute = 0;
	$timezone      = $result['timezone'];

	if ( is_string($timezone) && ($timezone != 'Z') )
	{
		$sign          = ( substr($timezone, 0, 1) == '-' ) ? -1 : 1;
		$offset_hour   = $sign * intval( substr($timezone, 1, 2) );
		$offset_minute = $sign * intval( substr($timezone, 4, 2) );
	}

// local time minus offset gives UTC; month and day 0 would make gmmktime()
// step back into the previous month, so they are raised to 1
	$result['timestamp'] = gmmktime(
		intval($result['hour'])   - $offset_hour,
		intval($result['minute']) - $offset_minute,
		intval($result['second']),
		max( 1, intval($result['month']) ),
		max( 1, intval($result['day']) ),
		intval($result['year']) );

	return $result;
}

//----- class end -----
}

// === class end ===
}

?>