各位朋友大家好,今天给大家带来的是
php爬虫采集类-phpQuery:支持抓取网站,非常强大的php类库!
它是一款基于PHP服务端开源的项目,可以让PHPer轻松处理DOM文档内容,比如可以获取某网站的头条信息。更有意思的是,它采用了jQuery的思想,可以像使用jQuery一样处理页面内容,获取你想要的页面信息。
由于源码比较长,有想要文件版源码的同学可以来我的PHP学习交流裙: 157531900 每天都会上传一些类库,技术分享!欢迎各路小白和大神的加入!
好了,废话不多说,上源码!
<?php
/**
* phpQuery is a server-side, chainable, CSS3 selector driven
* Document Object Model (DOM) API based on jQuery JavaScript Library.
*
* @version 0.9.5
* @link http://code.google.com/p/phpquery/
* @link http://phpquery-library.blogspot.com/
* @link http://jquery.com/
* @author Tobiasz Cudnik <tobiasz.cudnik/gmail.com>
* @license http://www.opensource.org/licenses/mit-license.php MIT License
* @package phpQuery
*/
// class names for instanceof
// TODO move them as class constants into phpQuery
define(DOMDOCUMENT, DOMDocument);
define(DOMELEMENT, DOMElement);
define(DOMNODELIST, DOMNodeList);
define(DOMNODE, DOMNode);
/**
* DOMEvent class.
*
* Based on
* @link http://developer.mozilla.org/En/DOM:event
* @author Tobiasz Cudnik <tobiasz.cudnik/gmail.com>
* @package phpQuery
* @todo implement ArrayAccess ?
*/
class DOMEvent {
/**
* Returns a boolean indicating whether the event bubbles up through the DOM or not.
*
* @var unknown_type
*/
public $bubbles = true;
/**
* Returns a boolean indicating whether the event is cancelable.
*
* @var unknown_type
*/
public $cancelable = true;
/**
* Returns a reference to the currently registered target for the event.
*
* @var unknown_type
*/
public $currentTarget;
/**
* Returns detail about the event, depending on the type of event.
*
* @var unknown_type
* @link http://developer.mozilla.org/en/DOM/event.detail
*/
public $detail; // ???
/**
* Used to indicate which phase of the event flow is currently being evaluated.
*
* NOT IMPLEMENTED
*
* @var unknown_type
* @link http://developer.mozilla.org/en/DOM/event.eventPhase
*/
public $eventPhase; // ???
/**
* The explicit original target of the event (Mozilla-specific).
*
* NOT IMPLEMENTED
*
* @var unknown_type
*/
public $explicitOriginalTarget; // moz only
/**
* The original target of the event, before any retargetings (Mozilla-specific).
*
* NOT IMPLEMENTED
*
* @var unknown_type
*/
public $originalTarget; // moz only
/**
* Identifies a secondary target for the event.
*
* @var unknown_type
*/
public $relatedTarget;
/**
* Returns a reference to the target to which the event was originally dispatched.
*
* @var unknown_type
*/
public $target;
/**
* Returns the time that the event was created.
*
* @var unknown_type
*/
public $timeStamp;
/**
* Returns the name of the event (case-insensitive).
*/
public $type;
public $runDefault = true;
public $data = null;
public function __construct($data) {
foreach($data as $k => $v) {
$this->$k = $v;
}
if (! $this->timeStamp)
$this->timeStamp = time();
}
/**
* Cancels the event (if it is cancelable).
*
*/
public function preventDefault() {
$this->runDefault = false;
}
/**
* Stops the propagation of events further along in the DOM.
*
*/
public function stopPropagation() {
$this->bubbles = false;
}
}
/**
* DOMDocumentWrapper class simplifies work with DOMDocument.
*
* Know bug:
* – in XHTML fragments, <br /> changes to <br clear=”none” />
*
* @todo check XML catalogs compatibility
* @author Tobiasz Cudnik <tobiasz.cudnik/gmail.com>
* @package phpQuery
*/
class DOMDocumentWrapper {
/**
* @var DOMDocument
*/
public $document;
public $id;
/**
* @todo Rewrite as method and quess if null.
* @var unknown_type
*/
public $contentType = ;
public $xpath;
public $uuid = 0;
public $data = array();
public $dataNodes = array();
public $events = array();
public $eventsNodes = array();
public $eventsGlobal = array();
/**
* @TODO iframes support http://code.google.com/p/phpquery/issues/detail?id=28
* @var unknown_type
*/
public $frames = array();
/**
* Document root, by default equals to document itself.
* Used by documentFragments.
*
* @var DOMNode
*/
public $root;
public $isDocumentFragment;
public $isXML = false;
public $isXHTML = false;
public $isHTML = false;
public $charset;
public function __construct($markup = null, $contentType = null, $newDocumentID = null) {
if (isset($markup))
$this->load($markup, $contentType, $newDocumentID);
$this->id = $newDocumentID
? $newDocumentID
: md5(microtime());
}
public function load($markup, $contentType = null, $newDocumentID = null) {
// phpQuery::$documents[$id] = $this;
$this->contentType = strtolower($contentType);
if ($markup instanceof DOMDOCUMENT) {
$this->document = $markup;
$this->root = $this->document;
$this->charset = $this->document->encoding;
// TODO isDocumentFragment
$loaded = true;
} else {
$loaded = $this->loadMarkup($markup);
}
if ($loaded) {
// $this->document->formatOutput = true;
$this->document->preserveWhiteSpace = true;
$this->xpath = new DOMXPath($this->document);
$this->afterMarkupLoad();
return true;
// remember last loaded document
// return phpQuery::selectDocument($id);
}
return false;
}
protected function afterMarkupLoad() {
if ($this->isXHTML) {
$this->xpath->registerNamespace(“html”, “http://www.w3.org/1999/xhtml”);
}
}
protected function loadMarkup($markup) {
$loaded = false;
if ($this->contentType) {
self::debug(“Load markup for content type {$this->contentType}”);
// content determined by contentType
list($contentType, $charset) = $this->contentTypeToArray($this->contentType);
switch($contentType) {
case text/html:
phpQuery::debug(“Loading HTML, content type {$this->contentType}”);
$loaded = $this->loadMarkupHTML($markup, $charset);
break;
case text/xml:
case application/xhtml+xml:
phpQuery::debug(“Loading XML, content type {$this->contentType}”);
$loaded = $this->loadMarkupXML($markup, $charset);
break;
default:
// for feeds or anything that sometimes doesnt use text/xml
if (strpos(xml, $this->contentType) !== false) {
phpQuery::debug(“Loading XML, content type {$this->contentType}”);
$loaded = $this->loadMarkupXML($markup, $charset);
} else
phpQuery::debug(“Could not determine document type from content type {$this->contentType}”);
}
} else {
// content type autodetection
if ($this->isXML($markup)) {
phpQuery::debug(“Loading XML, isXML() == true”);
$loaded = $this->loadMarkupXML($markup);
if (! $loaded && $this->isXHTML) {
phpQuery::debug(Loading as XML failed, trying to load as HTML, isXHTML == true);
$loaded = $this->loadMarkupHTML($markup);
}
} else {
phpQuery::debug(“Loading HTML, isXML() == false”);
$loaded = $this->loadMarkupHTML($markup);
}
}
return $loaded;
}
protected function loadMarkupReset() {
$this->isXML = $this->isXHTML = $this->isHTML = false;
}
protected function documentCreate($charset, $version = 1.0) {
if (! $version)
$version = 1.0;
$this->document = new DOMDocument($version, $charset);
$this->charset = $this->document->encoding;
// $this->document->encoding = $charset;
$this->document->formatOutput = true;
$this->document->preserveWhiteSpace = true;
}
protected function loadMarkupHTML($markup, $requestedCharset = null) {
if (phpQuery::$debug)
phpQuery::debug(Full markup load (HTML): .substr($markup, 0, 250));
$this->loadMarkupReset();
$this->isHTML = true;
if (!isset($this->isDocumentFragment))
$this->isDocumentFragment = self::isDocumentFragmentHTML($markup);
$charset = null;
$documentCharset = $this->charsetFromHTML($markup);
$addDocumentCharset = false;
if ($documentCharset) {
$charset = $documentCharset;
$markup = $this->charsetFixHTML($markup);
} else if ($requestedCharset) {
$charset = $requestedCharset;
}
if (! $charset)
$charset = phpQuery::$defaultCharset;
// HTTP 1.1 says that the default charset is ISO-8859-1
// @see http://www.w3.org/International/O-HTTP-charset
if (! $documentCharset) {
$documentCharset = ISO-8859-1;
$addDocumentCharset = true;
}
// Should be careful here, still need magic encoding detection since lots of pages have other default encoding
// Worse, some pages can have mixed encodings… well try not to worry about that
$requestedCharset = strtoupper($requestedCharset);
$documentCharset = strtoupper($documentCharset);
phpQuery::debug(“DOC: $documentCharset REQ: $requestedCharset”);
if ($requestedCharset && $documentCharset && $requestedCharset !== $documentCharset) {
phpQuery::debug(“CHARSET CONVERT”);
// Document Encoding Conversion
// http://code.google.com/p/phpquery/issues/detail?id=86
if (function_exists(mb_detect_encoding)) {
$possibleCharsets = array($documentCharset, $requestedCharset, AUTO);
$docEncoding = mb_detect_encoding($markup, implode(, , $possibleCharsets));
if (! $docEncoding)
$docEncoding = $documentCharset; // ok trust the document
phpQuery::debug(“DETECTED $docEncoding”);
// Detected does not match what document says…
if ($docEncoding !== $documentCharset) {
// Tricky..
}
if ($docEncoding !== $requestedCharset) {
phpQuery::debug(“CONVERT $docEncoding => $requestedCharset”);
$markup = mb_convert_encoding($markup, $requestedCharset, $docEncoding);
$markup = $this->charsetAppendToHTML($markup, $requestedCharset);
$charset = $requestedCharset;
}
} else {
phpQuery::debug(“TODO: charset conversion without mbstring…”);
}
}
$return = false;
if ($this->isDocumentFragment) {
phpQuery::debug(“Full markup load (HTML), DocumentFragment detected, using charset $charset”);
$return = $this->documentFragmentLoadMarkup($this, $charset, $markup);
} else {
if ($addDocumentCharset) {
phpQuery::debug(“Full markup load (HTML), appending charset: $charset”);
$markup = $this->charsetAppendToHTML($markup, $charset);
}
phpQuery::debug(“Full markup load (HTML), documentCreate($charset)”);
$this->documentCreate($charset);
$return = phpQuery::$debug === 2
? $this->document->loadHTML($markup)
: @$this->document->loadHTML($markup);
if ($return)
$this->root = $this->document;
}
if ($return && ! $this->contentType)
$this->contentType = text/html;
return $return;
}
protected function loadMarkupXML($markup, $requestedCharset = null) {
if (phpQuery::$debug)
phpQuery::debug(Full markup load (XML): .substr($markup, 0, 250));
$this->loadMarkupReset();
$this->isXML = true;
// check agains XHTML in contentType or markup
$isContentTypeXHTML = $this->isXHTML();
$isMarkupXHTML = $this->isXHTML($markup);
if ($isContentTypeXHTML || $isMarkupXHTML) {
self::debug(Full markup load (XML), XHTML detected);
$this->isXHTML = true;
}
// determine document fragment
if (! isset($this->isDocumentFragment))
$this->isDocumentFragment = $this->isXHTML
? self::isDocumentFragmentXHTML($markup)
: self::isDocumentFragmentXML($markup);
// this charset will be used
$charset = null;
// charset from XML declaration @var string
$documentCharset = $this->charsetFromXML($markup);
if (! $documentCharset) {
if ($this->isXHTML) {
// this is XHTML, try to get charset from content-type meta header
$documentCharset = $this->charsetFromHTML($markup);
if ($documentCharset) {
phpQuery::debug(“Full markup load (XML), appending XHTML charset $documentCharset”);
$this->charsetAppendToXML($markup, $documentCharset);
$charset = $documentCharset;
}
}
if (! $documentCharset) {
// if still no document charset…
$charset = $requestedCharset;
}
} else if ($requestedCharset) {
$charset = $requestedCharset;
}
if (! $charset) {
$charset = phpQuery::$defaultCharset;
}
if ($requestedCharset && $documentCharset && $requestedCharset != $documentCharset) {
// TODO place for charset conversion
// $charset = $requestedCharset;
}
$return = false;
if ($this->isDocumentFragment) {
phpQuery::debug(“Full markup load (XML), DocumentFragment detected, using charset $charset”);
$return = $this->documentFragmentLoadMarkup($this, $charset, $markup);
} else {
// FIXME ???
if ($isContentTypeXHTML && ! $isMarkupXHTML)
if (! $documentCharset) {
phpQuery::debug(“Full markup load (XML), appending charset $charset”);
$markup = $this->charsetAppendToXML($markup, $charset);
}
// see http://pl2.php.net/manual/en/book.dom.php#78929
// LIBXML_DTDLOAD (>= PHP 5.1)
// does XML ctalogues works with LIBXML_NONET
// $this->document->resolveExternals = true;
// TODO test LIBXML_COMPACT for performance improvement
// create document
$this->documentCreate($charset);
if (phpversion() < 5.1) {
$this->document->resolveExternals = true;
$return = phpQuery::$debug === 2
? $this->document->loadXML($markup)
: @$this->document->loadXML($markup);
} else {
/** @link http://pl2.php.net/manual/en/libxml.constants.php */
$libxmlStatic = phpQuery::$debug === 2
? LIBXML_DTDLOAD|LIBXML_DTDATTR|LIBXML_NONET
: LIBXML_DTDLOAD|LIBXML_DTDATTR|LIBXML_NONET|LIBXML_NOWARNING|LIBXML_NOERROR;
$return = $this->document->loadXML($markup, $libxmlStatic);
// if (! $return)
// $return = $this->document->loadHTML($markup);
}
if ($return)
$this->root = $this->document;
}