php爬虫采集技术,轻松抓取网站!含源码

各位朋友大家好,今天给大家带来的是

php爬虫采集类-phpQuery:支持抓取网站,非常强大的php类库!

它是一个基于PHP的服务端开源项目,可以让PHPer轻松处理DOM文档内容,比如获取某网站的头条信息。更有意思的是,它借鉴了jQuery的思想,可以像使用jQuery一样处理页面内容,获取你想要的页面信息。

由于源码比较长,想要文件版源码的同学可以加入我的PHP学习交流群: 157531900。群里每天都会上传一些类库和技术分享,欢迎各路小白和大神加入!

好了,废话不多说,上源码!

<?php

/**

* phpQuery is a server-side, chainable, CSS3 selector driven

* Document Object Model (DOM) API based on jQuery JavaScript Library.

*

* @version 0.9.5

* @link http://code.google.com/p/phpquery/

* @link http://phpquery-library.blogspot.com/

* @link http://jquery.com/

* @author Tobiasz Cudnik <tobiasz.cudnik/gmail.com>

* @license http://www.opensource.org/licenses/mit-license.php MIT License

* @package phpQuery

*/

// class names for instanceof

// TODO move them as class constants into phpQuery

// Both the constant name and its value must be quoted strings: a bare word
// here is an undefined-constant lookup (a fatal Error on PHP 8+).
define('DOMDOCUMENT', 'DOMDocument');
define('DOMELEMENT', 'DOMElement');
define('DOMNODELIST', 'DOMNodeList');
define('DOMNODE', 'DOMNode');

/**

* DOMEvent class.

*

* Based on

* @link http://developer.mozilla.org/En/DOM:event

* @author Tobiasz Cudnik <tobiasz.cudnik/gmail.com>

* @package phpQuery

* @todo implement ArrayAccess ?

*/

class DOMEvent {
    /**
     * Whether the event bubbles up through the DOM.
     * Set to false by stopPropagation().
     *
     * @var bool
     */
    public $bubbles = true;

    /**
     * Whether the event is cancelable.
     *
     * @var bool
     */
    public $cancelable = true;

    /**
     * Reference to the currently registered target for the event.
     *
     * @var mixed
     */
    public $currentTarget;

    /**
     * Detail about the event, depending on the type of event.
     *
     * @var mixed
     * @link http://developer.mozilla.org/en/DOM/event.detail
     */
    public $detail;

    /**
     * Phase of the event flow currently being evaluated.
     *
     * NOT IMPLEMENTED
     *
     * @var mixed
     * @link http://developer.mozilla.org/en/DOM/event.eventPhase
     */
    public $eventPhase;

    /**
     * Explicit original target of the event (Mozilla-specific).
     *
     * NOT IMPLEMENTED
     *
     * @var mixed
     */
    public $explicitOriginalTarget;

    /**
     * Original target of the event, before any retargetings (Mozilla-specific).
     *
     * NOT IMPLEMENTED
     *
     * @var mixed
     */
    public $originalTarget;

    /**
     * Secondary target for the event.
     *
     * @var mixed
     */
    public $relatedTarget;

    /**
     * Target to which the event was originally dispatched.
     *
     * @var mixed
     */
    public $target;

    /**
     * Time the event was created. Defaults to time() in the constructor
     * when no truthy value was supplied.
     *
     * @var mixed
     */
    public $timeStamp;

    /**
     * Name of the event (case-insensitive).
     *
     * @var mixed
     */
    public $type;

    /**
     * Cleared by preventDefault().
     *
     * @var bool
     */
    public $runDefault = true;

    /**
     * Extra payload carried by the event; null unless supplied.
     *
     * @var mixed
     */
    public $data = null;

    /**
     * Copies every key of $data onto the property of the same name, then
     * stamps the event with the current time if no timestamp was given.
     *
     * @param array|Traversable $data property name => value pairs
     */
    public function __construct($data) {
        foreach ($data as $property => $value) {
            $this->{$property} = $value;
        }
        if ($this->timeStamp) {
            return;
        }
        $this->timeStamp = time();
    }

    /**
     * Cancels the event by clearing the runDefault flag.
     */
    public function preventDefault() {
        $this->runDefault = false;
    }

    /**
     * Stops further propagation by clearing the bubbles flag.
     */
    public function stopPropagation() {
        $this->bubbles = false;
    }
}

/**

* DOMDocumentWrapper class simplifies work with DOMDocument.

*

* Known bug:

* - in XHTML fragments, <br /> changes to <br clear="none" />

*

* @todo check XML catalogs compatibility

* @author Tobiasz Cudnik <tobiasz.cudnik/gmail.com>

* @package phpQuery

*/

class DOMDocumentWrapper {

/**
 * The wrapped document.
 *
 * @var DOMDocument
 */
public $document;

/**
 * Identifier of this wrapper, assigned in the constructor.
 */
public $id;

/**
 * Lower-cased content type given to load(); empty string until then.
 *
 * @todo Rewrite as method and guess if null.
 * @var string
 */
public $contentType = '';

/**
 * DOMXPath instance created for $document by load().
 */
public $xpath;

public $uuid = 0;

public $data = array();

public $dataNodes = array();

public $events = array();

public $eventsNodes = array();

public $eventsGlobal = array();

/**
 * @TODO iframes support http://code.google.com/p/phpquery/issues/detail?id=28
 * @var array
 */
public $frames = array();

/**
 * Document root, by default equals to document itself.
 * Used by documentFragments.
 *
 * @var DOMNode
 */
public $root;

/**
 * Left unset on purpose: the markup loaders only auto-detect fragments
 * while this is still null.
 */
public $isDocumentFragment;

public $isXML = false;

public $isXHTML = false;

public $isHTML = false;

/**
 * Encoding read back from the loaded or created document.
 */
public $charset;

/**
 * Builds the wrapper, loading $markup immediately when one is given.
 *
 * @param mixed $markup        handed to load() unless null
 * @param mixed $contentType   forwarded to load()
 * @param mixed $newDocumentID used as the wrapper id when truthy; otherwise
 *                             an md5 of microtime() is generated
 */
public function __construct($markup = null, $contentType = null, $newDocumentID = null) {
    if ($markup !== null) {
        $this->load($markup, $contentType, $newDocumentID);
    }
    if ($newDocumentID) {
        $this->id = $newDocumentID;
    } else {
        $this->id = md5(microtime());
    }
}

/**
 * Loads markup (or adopts an existing DOMDocument) into this wrapper.
 *
 * On success the document keeps whitespace, a DOMXPath is created for it
 * and afterMarkupLoad() is invoked.
 *
 * @param mixed $markup        DOMDocument instance, or markup passed to loadMarkup()
 * @param mixed $contentType   stored lower-cased in $this->contentType
 * @param mixed $newDocumentID not used by this method
 * @return bool true when a document was loaded, false otherwise
 */
public function load($markup, $contentType = null, $newDocumentID = null) {
    // Cast before lower-casing: passing null to strtolower() is deprecated
    // since PHP 8.1. The stored value is '' in both cases.
    $this->contentType = strtolower((string) $contentType);
    if ($markup instanceof DOMDOCUMENT) {
        $this->document = $markup;
        $this->root = $this->document;
        $this->charset = $this->document->encoding;
        // TODO isDocumentFragment
        $loaded = true;
    } else {
        $loaded = $this->loadMarkup($markup);
    }
    if (! $loaded) {
        return false;
    }
    $this->document->preserveWhiteSpace = true;
    $this->xpath = new DOMXPath($this->document);
    $this->afterMarkupLoad();
    return true;
}

/**
 * Post-load hook: for XHTML documents, registers the "html" prefix for the
 * XHTML namespace on the XPath object.
 */
protected function afterMarkupLoad() {
    if ($this->isXHTML) {
        // Plain ASCII quotes are required; typographic quotes do not delimit strings.
        $this->xpath->registerNamespace('html', 'http://www.w3.org/1999/xhtml');
    }
}

/**
 * Picks the HTML or XML loader for $markup.
 *
 * With a content type set, the type decides: text/html goes to the HTML
 * loader; text/xml, application/xhtml+xml and any other type containing
 * "xml" go to the XML loader. Without one, isXML() decides, and markup that
 * fails to load as XML is retried as HTML when it was flagged as XHTML.
 *
 * @param mixed $markup markup to load
 * @return mixed result of the chosen loader, or false when none applied
 */
protected function loadMarkup($markup) {
    $loaded = false;
    if ($this->contentType) {
        self::debug("Load markup for content type {$this->contentType}");
        // content determined by contentType
        list($contentType, $charset) = $this->contentTypeToArray($this->contentType);
        switch ($contentType) {
            case 'text/html':
                phpQuery::debug("Loading HTML, content type {$this->contentType}");
                $loaded = $this->loadMarkupHTML($markup, $charset);
                break;
            case 'text/xml':
            case 'application/xhtml+xml':
                phpQuery::debug("Loading XML, content type {$this->contentType}");
                $loaded = $this->loadMarkupXML($markup, $charset);
                break;
            default:
                // For feeds or anything that sometimes doesn't use text/xml.
                // The haystack is the content type and the needle is 'xml';
                // with the arguments reversed, types such as
                // application/rss+xml never matched.
                if (strpos($this->contentType, 'xml') !== false) {
                    phpQuery::debug("Loading XML, content type {$this->contentType}");
                    $loaded = $this->loadMarkupXML($markup, $charset);
                } else {
                    phpQuery::debug("Could not determine document type from content type {$this->contentType}");
                }
        }
    } else {
        // content type autodetection
        if ($this->isXML($markup)) {
            phpQuery::debug("Loading XML, isXML() == true");
            $loaded = $this->loadMarkupXML($markup);
            if (! $loaded && $this->isXHTML) {
                phpQuery::debug('Loading as XML failed, trying to load as HTML, isXHTML == true');
                $loaded = $this->loadMarkupHTML($markup);
            }
        } else {
            phpQuery::debug("Loading HTML, isXML() == false");
            $loaded = $this->loadMarkupHTML($markup);
        }
    }
    return $loaded;
}

/**
 * Clears the three document-kind flags before a loader sets its own.
 */
protected function loadMarkupReset() {
    $this->isHTML = false;
    $this->isXHTML = false;
    $this->isXML = false;
}

/**
 * Creates a fresh DOMDocument in $this->document and records its encoding
 * in $this->charset. Output formatting and whitespace preservation are
 * both switched on.
 *
 * @param string $charset encoding passed to the DOMDocument constructor
 * @param string $version XML version; falls back to '1.0' when falsy
 */
protected function documentCreate($charset, $version = '1.0') {
    // The version must be the string '1.0': the float 1.0 is cast to "1",
    // which is not a valid XML version number.
    if (! $version) {
        $version = '1.0';
    }
    $this->document = new DOMDocument($version, $charset);
    $this->charset = $this->document->encoding;
    // $this->document->encoding = $charset;
    $this->document->formatOutput = true;
    $this->document->preserveWhiteSpace = true;
}

/**
 * Loads $markup as HTML, either as a full document or as a fragment.
 *
 * Working charset precedence: charset found in the markup by
 * charsetFromHTML(), then $requestedCharset, then phpQuery::$defaultCharset.
 * When the markup declares no charset, ISO-8859-1 is assumed as the document
 * charset and a charset declaration is appended before a full-document load.
 * When requested and document charsets differ and mbstring is available, the
 * markup is converted to the requested charset.
 *
 * On success with no content type set, $this->contentType becomes 'text/html'.
 *
 * @param string      $markup           HTML markup
 * @param string|null $requestedCharset charset requested by the caller, if any
 * @return mixed result of the fragment loader or of DOMDocument::loadHTML()
 */
protected function loadMarkupHTML($markup, $requestedCharset = null) {
    if (phpQuery::$debug) {
        phpQuery::debug('Full markup load (HTML): '.substr($markup, 0, 250));
    }
    $this->loadMarkupReset();
    $this->isHTML = true;
    if (! isset($this->isDocumentFragment)) {
        $this->isDocumentFragment = self::isDocumentFragmentHTML($markup);
    }
    $charset = null;
    $documentCharset = $this->charsetFromHTML($markup);
    $addDocumentCharset = false;
    if ($documentCharset) {
        $charset = $documentCharset;
        $markup = $this->charsetFixHTML($markup);
    } else if ($requestedCharset) {
        $charset = $requestedCharset;
    }
    if (! $charset) {
        $charset = phpQuery::$defaultCharset;
    }
    // HTTP 1.1 says that the default charset is ISO-8859-1
    // @see http://www.w3.org/International/O-HTTP-charset
    if (! $documentCharset) {
        $documentCharset = 'ISO-8859-1';
        $addDocumentCharset = true;
    }
    // Should be careful here, still need magic encoding detection since lots of pages have other default encoding
    // Worse, some pages can have mixed encodings... we'll try not to worry about that
    // Cast first: passing null to strtoupper() is deprecated since PHP 8.1.
    $requestedCharset = strtoupper((string) $requestedCharset);
    $documentCharset = strtoupper($documentCharset);
    phpQuery::debug("DOC: $documentCharset REQ: $requestedCharset");
    if ($requestedCharset && $documentCharset && $requestedCharset !== $documentCharset) {
        phpQuery::debug("CHARSET CONVERT");
        // Document Encoding Conversion
        // http://code.google.com/p/phpquery/issues/detail?id=86
        if (function_exists('mb_detect_encoding')) {
            $possibleCharsets = array($documentCharset, $requestedCharset, 'AUTO');
            $docEncoding = mb_detect_encoding($markup, implode(', ', $possibleCharsets));
            if (! $docEncoding) {
                $docEncoding = $documentCharset; // ok trust the document
            }
            phpQuery::debug("DETECTED $docEncoding");
            // Detected does not match what document says...
            if ($docEncoding !== $documentCharset) {
                // Tricky..
            }
            if ($docEncoding !== $requestedCharset) {
                phpQuery::debug("CONVERT $docEncoding => $requestedCharset");
                $markup = mb_convert_encoding($markup, $requestedCharset, $docEncoding);
                $markup = $this->charsetAppendToHTML($markup, $requestedCharset);
                $charset = $requestedCharset;
            }
        } else {
            phpQuery::debug("TODO: charset conversion without mbstring…");
        }
    }
    $return = false;
    if ($this->isDocumentFragment) {
        phpQuery::debug("Full markup load (HTML), DocumentFragment detected, using charset $charset");
        $return = $this->documentFragmentLoadMarkup($this, $charset, $markup);
    } else {
        if ($addDocumentCharset) {
            phpQuery::debug("Full markup load (HTML), appending charset: $charset");
            $markup = $this->charsetAppendToHTML($markup, $charset);
        }
        phpQuery::debug("Full markup load (HTML), documentCreate($charset)");
        $this->documentCreate($charset);
        // Parser warnings are shown only at debug level 2; otherwise suppressed.
        $return = phpQuery::$debug === 2
            ? $this->document->loadHTML($markup)
            : @$this->document->loadHTML($markup);
        if ($return) {
            $this->root = $this->document;
        }
    }
    if ($return && ! $this->contentType) {
        $this->contentType = 'text/html';
    }
    return $return;
}

protected function loadMarkupXML($markup, $requestedCharset = null) {

if (phpQuery::$debug)

phpQuery::debug(Full markup load (XML): .substr($markup, 0, 250));

$this->loadMarkupReset();

$this->isXML = true;

// check agains XHTML in contentType or markup

$isContentTypeXHTML = $this->isXHTML();

$isMarkupXHTML = $this->isXHTML($markup);

if ($isContentTypeXHTML || $isMarkupXHTML) {

self::debug(Full markup load (XML), XHTML detected);

$this->isXHTML = true;

}

// determine document fragment

if (! isset($this->isDocumentFragment))

$this->isDocumentFragment = $this->isXHTML

? self::isDocumentFragmentXHTML($markup)

: self::isDocumentFragmentXML($markup);

// this charset will be used

$charset = null;

// charset from XML declaration @var string

$documentCharset = $this->charsetFromXML($markup);

if (! $documentCharset) {

if ($this->isXHTML) {

// this is XHTML, try to get charset from content-type meta header

$documentCharset = $this->charsetFromHTML($markup);

if ($documentCharset) {

phpQuery::debug(“Full markup load (XML), appending XHTML charset $documentCharset”);

$this->charsetAppendToXML($markup, $documentCharset);

$charset = $documentCharset;

}

}

if (! $documentCharset) {

// if still no document charset…

$charset = $requestedCharset;

}

} else if ($requestedCharset) {

$charset = $requestedCharset;

}

if (! $charset) {

$charset = phpQuery::$defaultCharset;

}

if ($requestedCharset && $documentCharset && $requestedCharset != $documentCharset) {

// TODO place for charset conversion

// $charset = $requestedCharset;

}

$return = false;

if ($this->isDocumentFragment) {

phpQuery::debug(“Full markup load (XML), DocumentFragment detected, using charset $charset”);

$return = $this->documentFragmentLoadMarkup($this, $charset, $markup);

} else {

// FIXME ???

if ($isContentTypeXHTML && ! $isMarkupXHTML)

if (! $documentCharset) {

phpQuery::debug(“Full markup load (XML), appending charset $charset”);

$markup = $this->charsetAppendToXML($markup, $charset);

}

// see http://pl2.php.net/manual/en/book.dom.php#78929

// LIBXML_DTDLOAD (>= PHP 5.1)

// does XML ctalogues works with LIBXML_NONET

// $this->document->resolveExternals = true;

// TODO test LIBXML_COMPACT for performance improvement

// create document

$this->documentCreate($charset);

if (phpversion() < 5.1) {

$this->document->resolveExternals = true;

$return = phpQuery::$debug === 2

? $this->document->loadXML($markup)

: @$this->document->loadXML($markup);

} else {

/** @link http://pl2.php.net/manual/en/libxml.constants.php */

$libxmlStatic = phpQuery::$debug === 2

? LIBXML_DTDLOAD|LIBXML_DTDATTR|LIBXML_NONET

: LIBXML_DTDLOAD|LIBXML_DTDATTR|LIBXML_NONET|LIBXML_NOWARNING|LIBXML_NOERROR;

$return = $this->document->loadXML($markup, $libxmlStatic);

// if (! $return)

// $return = $this->document->loadHTML($markup);

}

if ($return)

$this->root = $this->document;

}

举报/反馈

© 版权声明
THE END
喜欢就支持一下吧
点赞6 分享
评论 抢沙发
头像
欢迎您留下宝贵的见解!
提交
头像

昵称

取消
昵称表情代码图片