PHP原生简单爬虫代码-花花博客

早就知道PHP可以做爬虫，但是一直没做过。旁边的py小哥天天爬得不亦乐乎，就想写个简单的爬虫玩玩。单线程的速度确实是慢，运行了大概两个小时，才爬到了4500张图片，这个效率感人。

<?php

/**
 * Minimal single-threaded, recursive image crawler.
 *
 * Starting from a page URL it downloads the page, saves every .jpg image
 * referenced by an <img> tag (first match per line) into ./npyimg/, and
 * follows every "/arc/..." link (first match per line) that has not been
 * seen before. A copy of each fetched page is kept under ./xxx/.
 */
class Cow
{
    /** Base URL of the site to crawl. Placeholder only; the real target is intentionally not provided. */
    public $url = 'http://xxxxx.com';

    /** Regex capturing the address of a .jpg image inside an <img src="..."> attribute. */
    public $img = '/<img src="(.*?jpg)"/';

    /** Regex capturing a site-relative link that starts with /arc/. */
    public $href = '/(\/arc\/.*?)"/';

    /** List of page URLs already scheduled for crawling; prevents visiting a page twice. */
    public $boxes = [];

    /**
     * Crawl one page, save its images, then recurse into its unseen /arc/ links.
     *
     * @param string $url Absolute URL of the page to crawl; trailing slashes are stripped.
     *
     * @return void
     */
    public function run($url)
    {
        ini_set('memory_limit', '2048M'); // allow a generous memory budget
        set_time_limit(0);                // disable the script execution time limit

        $url = rtrim($url, '/'); // normalise the URL

        $content = file_get_contents($url);
        if ($content === false) {
            // Nothing to parse; skip this page instead of processing a boolean.
            error_log('Cow: failed to fetch ' . $url);
            return;
        }

        // Keep a copy of the page on disk, named after the last URL segment.
        $this->ensureDirectory('./xxx');
        $nname = './xxx/' . strrchr($url, '/') . '.txt';
        if (file_put_contents($nname, $content) === false) {
            die("Unable to open file!");
        }

        // Scan the page line by line from memory, so no file handle stays
        // open while the recursion below descends into other pages.
        foreach (explode("\n", $content) as $line) {
            // Save the image referenced on this line, if any.
            if (preg_match($this->img, $line, $matchs)) {
                $imgUrl = $this->forceHttp($matchs[1]);
                $this->saveImage($imgUrl);
                echo $imgUrl . PHP_EOL;
            }

            // Follow the article link on this line, if it has not been seen yet.
            if (preg_match($this->href, $line, $matchs)) {
                $next = $this->url . $matchs[1];
                if (!in_array($next, $this->boxes, true)) {
                    echo $next . PHP_EOL;
                    $this->boxes[] = $next;
                    $this->run($next);
                }
            }
        }

        echo 'the end' . PHP_EOL;
    }

    /**
     * Download one image and store it under ./npyimg/ with a random, unused name.
     *
     * @param string $imgUrl Absolute URL of the image.
     *
     * @return bool True when the image was downloaded and written.
     */
    private function saveImage(string $imgUrl): bool
    {
        $data = file_get_contents($imgUrl);
        if ($data === false) {
            // Do not create an empty .jpg for a failed download.
            error_log('Cow: failed to download ' . $imgUrl);
            return false;
        }

        $this->ensureDirectory('./npyimg');

        // Five hex characters give about a million names, so collisions are
        // likely after a few thousand images; retry until the name is free.
        do {
            $path = './npyimg/' . $this->str_random(5) . '.jpg';
        } while (file_exists($path));

        return file_put_contents($path, $data) !== false;
    }

    /**
     * Downgrade an https:// URL to http://, touching only the scheme.
     *
     * @param string $address URL as found in the page.
     *
     * @return string The URL with an http scheme; other URLs are returned unchanged.
     */
    private function forceHttp(string $address): string
    {
        if (strncmp($address, 'https://', 8) === 0) {
            return 'http://' . substr($address, 8);
        }

        return $address;
    }

    /**
     * Create a directory (recursively) when it does not exist yet.
     *
     * @param string $dir Directory path.
     *
     * @return void
     */
    private function ensureDirectory(string $dir)
    {
        // The second is_dir() covers a concurrent process creating it first.
        if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
            error_log('Cow: cannot create directory ' . $dir);
        }
    }

    /**
     * Generate a random hexadecimal string of exactly $len characters.
     *
     * @param int $len Desired length; values below 1 yield an empty string.
     *
     * @return string Random lowercase hex string.
     */
    private function str_random($len)
    {
        $len = (int) $len;
        if ($len < 1) {
            return '';
        }

        // One byte yields two hex characters: round up for odd lengths,
        // then trim the surplus character.
        $bytes = random_bytes((int) ceil($len / 2));

        return substr(bin2hex($bytes), 0, $len);
    }
}

$cow = new Cow();
$cow->run($cow->url);
?>

文章版权归作者所有，未经允许请勿转载。

THE END

作文百科