<?php
// $Id: rssc_xml_utility.php,v 1.1.1.1 2006/01/03 05:06:04 ohwada Exp $

//=========================================================
// Rss Center Module
// 2006-01-01 K.OHWADA
//=========================================================

// === class begin ===
if( !class_exists('rssc_xml_utility') ) 
{

//=========================================================
// class rssc_xml_utility
//=========================================================
class rssc_xml_utility extends rssc_error
{
// class instance
	var $_xml_parser;
	var $_remote;
	var $_convert;
	var $_strings;

// basic config
	var $_sel_priority = RSSC_C_SEL_ATOM;

// result
	var $_html_data;
	var $_xml_data;
	var $_xml_mode;
	var $_rdf_url;
	var $_rss_url;
	var $_atom_url;
	var $_xml_encoding_orig;
	var $_xml_kind;
	var $_parsed_data = array();


//---------------------------------------------------------
// constructor
//---------------------------------------------------------
// Constructor.
// Runs the parent rssc_error() initialiser and then stores the shared
// instances of the parser, remote file reader, encoding converter and
// string helper that the other methods of this class call.
// Declared as __construct() because PHP 8 no longer treats a method that
// has the same name as its class as the constructor.
function __construct()
{
	$this->rssc_error();

// class instance
	$this->_xml_parser =& rssc_xml_parser::getInstance();
	$this->_remote     =& rssc_remote_file::getInstance();
	$this->_convert    =& rssc_convert::getInstance();
	$this->_strings    =& rssc_strings::getInstance();

}

// PHP 4 style constructor name, kept so that PHP 4 and any code that calls
// $this->rssc_xml_utility() explicitly (e.g. a subclass) keeps working.
function rssc_xml_utility()
{
	$this->__construct();
}

// Returns the shared rssc_xml_utility object, creating it on the first call.
// The object lives in a function-level static variable and is returned by
// reference.
// NOTE(review): declared without "static" although it is written to be
// called as rssc_xml_utility::getInstance() - confirm the target PHP version.
function &getInstance()
{
	static $singleton;

	if ( $singleton === null )
	{
		$singleton = new rssc_xml_utility();
	}

	return $singleton;
}

//=========================================================
// public
//=========================================================

//---------------------------------------------------------
// discover XML url & read XML & parse XML
//---------------------------------------------------------
// Runs the whole chain for one HTML page: discover() the feed links, pick
// the feed URL that belongs to the selected mode, then parse_by_url() it.
// The encoding is passed as '' so that parse_by_url() detects it itself.
// Returns the parsed data held in _parsed_data, or false when discovery or
// parse_by_url() reports failure.
function discover_parse($html_url)
{
	if ( $this->discover($html_url) )
	{
		$feed_url = $this->get_xmlurl_by_mode();

		if ( $this->parse_by_url($feed_url, '', $this->_xml_mode) )
		{
			return $this->_parsed_data;
		}
	}

	return false;
}

//---------------------------------------------------------
// discover XML link
//---------------------------------------------------------
// Reads the HTML page at $html_url, extracts its RDF / RSS / ATOM feed
// links and chooses which one to use.
// $sel is one of the RSSC_C_SEL_* values; when empty, _sel_priority is used.
// The preferred kind wins if the page offers it; otherwise the first
// available of ATOM, RSS, RDF (in that order) is taken.
// On success stores the chosen mode and all three URLs and returns true.
// Returns false when the page cannot be read or carries no feed link.
function discover($html_url, $sel='')
{
	$this->_set_log_func_name('discover');

	$priority = empty($sel) ? $this->_sel_priority : $sel;

	$html = $this->read_html($html_url);
	if ( empty($html) )
	{
		return false;
	}

	list($rdf_url, $rss_url, $atom_url) = $this->find_link($html, $html_url);

// candidates in the order they are tried: preferred kind first, then fallback
	$candidates = array();

	if ( $priority == RSSC_C_SEL_ATOM )
	{
		$candidates[] = array($atom_url, RSSC_C_MODE_ATOM);
	}
	if ( $priority == RSSC_C_SEL_RSS )
	{
		$candidates[] = array($rss_url, RSSC_C_MODE_RSS);
	}
	if ( $priority == RSSC_C_SEL_RDF )
	{
		$candidates[] = array($rdf_url, RSSC_C_MODE_RDF);
	}

	$candidates[] = array($atom_url, RSSC_C_MODE_ATOM);
	$candidates[] = array($rss_url,  RSSC_C_MODE_RSS);
	$candidates[] = array($rdf_url,  RSSC_C_MODE_RDF);

	foreach ( $candidates as $candidate )
	{
		if ( $candidate[0] )
		{
			$this->_xml_mode = $candidate[1];
			$this->_rdf_url  = $rdf_url;
			$this->_rss_url  = $rss_url;
			$this->_atom_url = $atom_url;

			return true;
		}
	}

	return false;
}

//---------------------------------------------------------
// read XML & parse XML
//---------------------------------------------------------
// Reads the feed at $xml_url, converts it for the parser and parses it.
// $xml_encoding : source encoding; detected with find_encoding() when empty
// $xml_mode     : RSSC_C_MODE_* value; detected with find_kind() when empty
// Returns false when the feed cannot be read or when a value that had to be
// detected cannot be found; otherwise returns true. The parsed result is
// stored by parse() and can be fetched with get_data().
function parse_by_url($xml_url, $xml_encoding='', $xml_mode='')
{
	$xml = $this->read_xml($xml_url);
	if ( !$xml )
	{
		return false;
	}

// encoding: detect it unless the caller supplied one
	if ( !$xml_encoding )
	{
		$source_encoding = $this->find_encoding($xml);

		if ( !$source_encoding )
		{
			return false;
		}
	}
	else
	{
		$source_encoding = $xml_encoding;
		$this->_xml_encoding_orig = $xml_encoding;
	}

// mode: detect it unless the caller supplied one
	if ( !$xml_mode )
	{
		$feed_mode = $this->find_kind($xml);

		if ( !$feed_mode )
		{
			return false;
		}
	}
	else
	{
		$feed_mode = $xml_mode;
		$this->_xml_mode = $xml_mode;
	}

	list($xml_for_parser, $parser_encoding)
		= $this->convert_to_parse($xml, $source_encoding);

// NOTE(review): the value returned by parse() is not inspected here, so
// true is returned whatever the parser produced - confirm this is intended.
	$this->parse($xml_for_parser, $feed_mode, $parser_encoding);

	return true;
}

//---------------------------------------------------------
// read remote HTML
//---------------------------------------------------------
// Fetches the HTML page at $url through the remote file reader.
// On success keeps the text in _html_data and returns it.
// On failure records an error message plus the reader's error code and
// returns false.
function read_html($url)
{
	$this->_set_log_func_name('read_html');

// read remote HTML
	$html = $this->_remote->read_file($url);

	if ( $html )
	{
		$this->_html_data = $html;
		return $html;
	}

	$this->_set_errors( "rssc: cannot read html data from $url" );
	$this->_set_errors( $this->_remote->getErrorCode() );
	return false;
}

//---------------------------------------------------------
// read remote XML
//---------------------------------------------------------
// Fetches the feed document at $url through the remote file reader.
// On success keeps the text in _xml_data and returns it.
// On failure sets the error code RSSC_E_REMOTE_XML, records an error
// message plus the reader's error code, and returns false.
function read_xml($url)
{
	$this->_set_log_func_name('read_xml');

// read remote XML
	$xml = $this->_remote->read_file($url);

	if ( $xml )
	{
		$this->_xml_data = $xml;
		return $xml;
	}

	$this->_set_error_code( RSSC_E_REMOTE_XML );
	$this->_set_errors( "rssc: cannot read xml data from $url" );
	$this->_set_errors( $this->_remote->getErrorCode() );
	return false;
}

//---------------------------------------------------------
// find XML link: auto discovery
//---------------------------------------------------------
// Auto discovery: extracts the RDF, RSS and ATOM feed URLs from HTML text.
// $html_url, when given, is passed on so that relative links can be made
// absolute. Records an error when none of the three links is present.
// Always returns array(rdf_url, rss_url, atom_url).
function find_link($html_data, $html_url='')
{
	$this->_set_log_func_name('find_link');

	$links = $this->_find_xml_link($html_data, $html_url);
	list($rdf_url, $rss_url, $atom_url) = $links;

	if ( !$rdf_url && !$rss_url && !$atom_url )
	{
		$this->_set_errors( 'rssc: cannot find xml link' );
	}

	return array($rdf_url, $rss_url, $atom_url);
}

//---------------------------------------------------------
// find XML encoding
//---------------------------------------------------------
// Detects the encoding named in the XML text.
// On success stores the lower-cased name in _xml_encoding_orig and returns
// it. Records an error and returns false when no encoding is found.
function find_encoding($xml)
{
	$this->_set_log_func_name('find_encoding');

	$declared = $this->_find_xml_encoding($xml);

	if ( $declared )
	{
		$lowered = strtolower( $declared );
		$this->_xml_encoding_orig = $lowered;
		return $lowered;
	}

	$this->_set_errors( "rssc: cannot find xml encoding" );
	return false;
}

//---------------------------------------------------------
// find XML mode
//---------------------------------------------------------
// Detects the feed kind of the XML text and maps it to a mode constant:
// 'rdf' -> RSSC_C_MODE_RDF, 'rss' -> RSSC_C_MODE_RSS, 'atom' -> RSSC_C_MODE_ATOM.
// On success stores the kind in _xml_kind and the mode in _xml_mode and
// returns the mode. Records an error and returns false when the kind is
// missing or is not one of the three known names.
function find_kind($xml)
{
	$this->_set_log_func_name('find_kind');

	$kind  = $this->_find_xml_kind($xml);
	$known = false;

	if ( $kind )
	{
		$known = true;

		if ( $kind == 'rdf' )
		{
			$mode = RSSC_C_MODE_RDF;
		}
		elseif ( $kind == 'rss' )
		{
			$mode = RSSC_C_MODE_RSS;
		}
		elseif ( $kind == 'atom' )
		{
			$mode = RSSC_C_MODE_ATOM;
		}
		else
		{
			$known = false;
		}
	}

	if ( !$known )
	{
		$this->_set_errors( "rssc: cannot find xml kind" );
		return false;
	}

	$this->_xml_kind = $kind;
	$this->_xml_mode = $mode;
	return $mode;
}

//---------------------------------------------------------
// convert XML to parse
//---------------------------------------------------------
// Public entry point for preparing XML text for the parser.
// Sets the log function name and delegates to _convert_xml_to_parse(),
// returning its result: array(xml text, encoding name).
function convert_to_parse($xml_data, $xml_encoding)
{
	$this->_set_log_func_name('convert_to_parse');
	return $this->_convert_xml_to_parse($xml_data, $xml_encoding);
}

//---------------------------------------------------------
// set param
//---------------------------------------------------------
// Sets the feed kind that discover() prefers when no $sel argument is
// passed to it. discover() compares the value with the RSSC_C_SEL_* constants.
function set_priority($value)
{
	$this->_sel_priority = $value;
}

// Stores $value in the _xml_encoding_local property.
// NOTE(review): _xml_encoding_local is not among the properties declared
// at the top of this class and no method in this file reads it - confirm
// whether callers are meant to use set_local_encoding() instead.
function set_encoding_local($value)
{
	$this->_xml_encoding_local = $value;
}

//---------------------------------------------------------
// get result of auto discovery
//---------------------------------------------------------
// Returns the HTML text stored by the last successful read_html() call.
function get_html()
{
	return $this->_html_data;
}

// Returns the current mode (RSSC_C_MODE_* value) as last set by
// discover(), parse_by_url() or find_kind().
function get_xml_mode()
{
	return $this->_xml_mode;
}

// Returns the RDF feed URL stored by the last successful discover() call.
function get_rdf_url()
{
	return $this->_rdf_url;
}

// Returns the RSS feed URL stored by the last successful discover() call.
function get_rss_url()
{
	return $this->_rss_url;
}

// Returns the ATOM feed URL stored by the last successful discover() call.
function get_atom_url()
{
	return $this->_atom_url;
}

// Returns the discovered feed URL that matches the current mode:
// the RDF, RSS or ATOM URL for RSSC_C_MODE_RDF / _RSS / _ATOM respectively.
// Returns false when the mode is none of these.
function get_xmlurl_by_mode()
{
	$mode = $this->_xml_mode;

	if ( $mode == RSSC_C_MODE_RDF )
	{
		return $this->_rdf_url;
	}

	if ( $mode == RSSC_C_MODE_RSS )
	{
		return $this->_rss_url;
	}

	if ( $mode == RSSC_C_MODE_ATOM )
	{
		return $this->_atom_url;
	}

	return false;
}

//---------------------------------------------------------
// get result of parse
//---------------------------------------------------------
// Returns the XML text stored by the last successful read_xml() call.
function get_xml()
{
	return $this->_xml_data;
}

// Returns the source encoding of the feed: either the value handed to
// parse_by_url() or the lower-cased name detected by find_encoding().
function get_xml_encoding()
{
	return $this->_xml_encoding_orig;
}

// Returns the kind name ('rdf', 'rss' or 'atom') stored by the last
// successful find_kind() call.
function get_xml_kind()
{
	return $this->_xml_kind;
}

// Returns the result stored by the last parse() call
// (an empty array until parse() has been called).
function get_data()
{
	return $this->_parsed_data;
}

// Returns the value stored under $key in the 'channel' part of the parsed
// data, or false when that entry is not set.
function get_channel_var($key)
{
	return isset($this->_parsed_data['channel'][$key])
		? $this->_parsed_data['channel'][$key]
		: false;
}

//=========================================================
// override
//=========================================================
// Sets the log-printing debug flag on this object and on the XML parser.
// $value is converted to an integer first.
function set_debug_print_log($value)
{
	$flag = (int) $value;

	$this->_flag_debug_print_log = $flag;
	$this->_xml_parser->set_debug_print_log($flag);
}

// Sets the error-printing debug flag on this object and on the XML parser.
// $value is converted to an integer first.
function set_debug_print_error($value)
{
	$flag = (int) $value;

	$this->_flag_debug_print_error = $flag;
	$this->_xml_parser->set_debug_print_error($flag);
}

//=========================================================
// private
//=========================================================

//---------------------------------------------------------
// find RDF/RSS/ATOM link in HTML
//---------------------------------------------------------
// <link rel="alternate" type="application/rdf+xml"  title="RDF" href="xxx" /> 
// <link rel="alternate" type="application/rss+xml"  title="RSS" href="xxx" /> 
// <link rel="alternate" type="application/atom+xml" title="ATOM" href="xxx" /> 
//---------------------------------------------------------
// Scans HTML text for feed auto-discovery <link> tags.
// A tag counts when it has rel="alternate", a type of application/rdf+xml,
// application/rss+xml or application/atom+xml, and an href. For each kind
// the first non-empty href found is kept.
// $html : HTML text to scan
// $url  : URL of the page; when given, hrefs are made absolute against it
// Returns array(rdf href, rss href, atom href); a missing kind is ''.
function _find_xml_link($html, $url='')
{
	$href_rdf  = '';
	$href_rss  = '';
	$href_atom = '';

// collect the attribute text of every <link ...> tag
	preg_match_all('/<link\s+(.*?)\s*\/?>/si', $html, $match);

	foreach ( $match[1] as $link_tag )
	{
	// Read the name=value pairs with one pattern instead of splitting on
	// whitespace, so that values containing spaces and "href = 'x'" written
	// with spaces around "=" are kept intact.
		$attr = array();
		preg_match_all(
			'/([^\s=\'"<>\/]+)\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))/s',
			$link_tag, $pairs, PREG_SET_ORDER);

		foreach ( $pairs as $pair )
		{
		// groups: 2 = double-quoted, 3 = single-quoted, 4 = unquoted value;
		// groups after the one that matched are absent from $pair
			if ( isset($pair[4]) )
			{
				$value = $pair[4];
			}
			elseif ( isset($pair[3]) )
			{
				$value = $pair[3];
			}
			else
			{
				$value = isset($pair[2]) ? $pair[2] : '';
			}

		// a repeated attribute overwrites the earlier one
			$attr[ strtolower($pair[1]) ] = $value;
		}

		if ( !isset($attr['rel']) || !isset($attr['type']) || !isset($attr['href']) )
		{
			continue;
		}

		if ( strtolower( trim($attr['rel']) ) != 'alternate' )
		{
			continue;
		}

		$type = strtolower( trim($attr['type']) );
		$href = $attr['href'];

		if ( empty($href_rdf) && ($type == 'application/rdf+xml') )
		{
		// the RDF link goes into its own slot, not into the RSS one
			$href_rdf = $href;
		}
		elseif ( empty($href_rss) && ($type == 'application/rss+xml') )
		{
			$href_rss = $href;
		}
		elseif ( empty($href_atom) && ($type == 'application/atom+xml') )
		{
			$href_atom = $href;
		}
	}

	if ($url)
	{
		$href_rdf  = $this->_relative_to_full_url($href_rdf,  $url);
		$href_rss  = $this->_relative_to_full_url($href_rss,  $url);
		$href_atom = $this->_relative_to_full_url($href_atom, $url);
	}

	return array($href_rdf, $href_rss, $href_atom);
}

//---------------------------------------------------------
// relative_to_full_url
//---------------------------------------------------------
// Turns a link found in an HTML page into a full URL.
// $url      : link value; may be absolute, protocol-relative ("//host/x"),
//             root-relative ("/x") or relative ("x")
// $url_html : URL of the page that contained the link
// Returns '' for an empty $url, $url itself when it is already an http(s)
// URL, otherwise the link resolved against $url_html.
// For a relative link the last path segment of the page URL is dropped only
// when it contains a dot (i.e. looks like a file name); otherwise the whole
// path is treated as a directory. "./" and "../" segments are not collapsed.
// Uses preg_match() throughout because ereg() no longer exists in PHP 7+.
function _relative_to_full_url($url, $url_html)
{
	if ( empty($url) )  return '';

// already a full url
	if ( preg_match('~^https?://~i', $url) )
	{
		return $url;
	}

// split the page url into "scheme://host[:port]" and its path;
// query string and fragment are left out of the path.
// when the page url is not an http(s) url, all of it is treated as a path
	$scheme = 'http';
	$origin = '';
	$path   = $url_html;

	if ( preg_match('~^(https?)://([^/?#]+)([^?#]*)~i', $url_html, $match) )
	{
		$scheme = $match[1];
		$origin = $scheme.'://'.$match[2];
		$path   = $match[3];
	}

// start from "//": keep the page's scheme
	if ( substr($url, 0, 2) == '//' )
	{
		return $scheme.':'.$url;
	}

// start from "/"
	if ( substr($url, 0, 1) == '/' )
	{
		if ( $origin === '' )
		{
		// page url had no recognisable host
			return 'http://'.$url;
		}

		return $origin.$url;
	}

// relative: work out the directory part of the page path
	$slash = strrpos($path, '/');
	$last  = ($slash === false) ? $path : (string) substr($path, $slash + 1);
	$dir   = $path;

	if ( strpos($last, '.') !== false )
	{
		$dir = ($slash === false) ? '' : substr($path, 0, $slash + 1);
	}

	if ( $dir === '' )
	{
	// "http://host" with no path at all resolves against the root
		if ( $origin !== '' )
		{
			$dir = '/';
		}
	}
	elseif ( substr($dir, -1) != '/' )
	{
		$dir .= '/';
	}

	return $origin.$dir.$url;
}

//---------------------------------------------------------
// find XML encoding
// < ? xml version="1.0" encoding="UTF-8" ? >
//---------------------------------------------------------
// Extracts the encoding name from the XML declaration of $text.
// The value may be written with double or single quotes and with
// whitespace around "=", as the XML grammar allows.
// "<?xml" must be followed by whitespace, so a processing instruction such
// as "<?xml-stylesheet" is not mistaken for the declaration.
// Returns the name exactly as written, or false when there is no
// declaration or it carries no encoding attribute.
function _find_xml_encoding($text)
{
	if ( preg_match('/<\?xml\s(.*?)\?>/si', $text, $match1) )
	{
		$line = $match1[1];

	// group 1 is the opening quote, \1 requires the same quote to close
		if ( preg_match('/encoding\s*=\s*(["\'])(.*?)\1/si', $line, $match2) )
		{
			return $match2[2];
		}
	}

	return false;
}

//---------------------------------------------------------
// find XML mode
// < rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" >
// < rss version="2.0" >
// < feed version="0.3" xmlns="http://purl.org/atom/ns#" >
// < feed xmlns="http://www.w3.org/2005/Atom" >
//---------------------------------------------------------
// Detects the feed kind from the tags present in $text.
// Checked in this order: a <rdf:RDF tag gives 'rdf', a <rss tag gives
// 'rss', and a <feed tag whose attribute text contains "atom" gives 'atom'.
// Matching ignores case. Returns false when none applies.
function _find_xml_kind($text)
{
	$kind = false;

	if ( preg_match('/<rdf:RDF(.*?)>/si', $text) )
	{
		$kind = 'rdf';
	}
	elseif ( preg_match('/<rss(.*?)>/si', $text) )
	{
		$kind = 'rss';
	}
	elseif ( preg_match('/<feed(.*?)>/si', $text, $feed_tag)
		&& preg_match('/atom/si', $feed_tag[1]) )
	{
		$kind = 'atom';
	}

	return $kind;
}

//---------------------------------------------------------
// convert xml to parse
//---------------------------------------------------------
// Prepares XML text for the parser.
// - 'utf-8', 'us-ascii' and 'iso-8859-1' are left in their encoding
// - any other non-empty encoding is converted to 'utf-8'
// In both cases the text is then passed through _cleanup_xml().
// - an empty encoding leaves the text untouched
// Returns array(xml text, encoding of that text).
function _convert_xml_to_parse($xml, $encoding)
{
// encodings that are handed to the parser without conversion
	$keep_as_is = array('utf-8', 'us-ascii', 'iso-8859-1');

	if ( in_array($encoding, $keep_as_is) )
	{
		$text   = $xml;
		$result = $encoding;
	}
	elseif ( $encoding )
	{
		$result = 'utf-8';
		$text   = $this->_convert->convert($xml, $result, $encoding);
	}
	else
	{
	// no action
		return array($xml, $encoding);
	}

	return array($this->_cleanup_xml( $text ), $result);
}

// Passes $text through the string helper's strip_control_code() and
// returns the result.
function _cleanup_xml($text)
{
	return $this->_strings->strip_control_code($text);
}

//=========================================================
// use xml parser
//=========================================================
// Hands the XML text, mode and encoding to the XML parser, keeps whatever
// it returns in _parsed_data (readable through get_data()) and returns it.
function parse($xml_data, $xml_mode, $xml_encoding)
{
	$this->_parsed_data = $this->_xml_parser->parse($xml_data, $xml_mode, $xml_encoding);

	return $this->_parsed_data;
}

// Forwards $value to the XML parser's set_rss_parser().
function set_rss_parser($value)
{
	$this->_xml_parser->set_rss_parser($value);
}

// Forwards $value to the XML parser's set_atom_parser().
function set_atom_parser($value)
{
	$this->_xml_parser->set_atom_parser($value);
}

// Forwards $value to the XML parser's set_local_encoding().
function set_local_encoding($value)
{
	$this->_xml_parser->set_local_encoding($value);
}

//----- class end -----
}

// === class end ===
}

?>