我如何在PHP中创建一个简单的爬虫？

我有一个包含许多链接的网页。我想编写一个脚本，把这些链接所指向页面中的所有数据转储到本地文件中。

有没有人用PHP做到这一点？一般的指导方针和疑难解答就足以作为答案。

唉。不要用正则表达式解析HTML。

这是一个受到Tatu的启发的DOM版本：

<?php

/**
 * Recursively crawl a page and the pages it links to, echoing every page's
 * URL and HTML to STDOUT.
 *
 * Relative links are resolved against the root of the current page's host:
 * scheme, credentials, host and port are kept and the path is replaced.
 * Empty links, in-page fragments and non-HTTP schemes (mailto:, javascript:,
 * tel:, ...) are skipped because there is no page to fetch for them.
 *
 * @param string $url   URL of the page to fetch.
 * @param int    $depth Remaining number of link levels to follow; 0 stops.
 *
 * @return void
 */
function crawl_page($url, $depth = 5)
{
    // Shared across all recursive calls so that no URL is fetched twice.
    static $seen = array();

    if (isset($seen[$url]) || $depth === 0) {
        return;
    }
    $seen[$url] = true;

    $dom = new DOMDocument('1.0');
    // Real-world markup is rarely valid; silence the parser warnings.
    @$dom->loadHTMLFile($url);

    $anchors = $dom->getElementsByTagName('a');
    foreach ($anchors as $element) {
        $href = trim($element->getAttribute('href'));

        // Nothing to crawl for empty links or same-page fragments.
        if ($href === '' || $href[0] === '#') {
            continue;
        }

        // Only a real http:// or https:// prefix marks an absolute link; a
        // plain "starts with http" test misreads names such as "http.html".
        $isAbsolute = (bool) preg_match('~^https?://~i', $href);

        if (!$isAbsolute) {
            // Any other explicit scheme (mailto:, javascript:, ...) is not a page.
            if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href)) {
                continue;
            }

            $path = '/' . ltrim($href, '/');
            if (extension_loaded('http')) {
                // pecl_http can swap the path on the current URL directly.
                $href = http_build_url($url, array('path' => $path));
            } else {
                // Otherwise rebuild the URL by hand from the current page's parts.
                $parts = parse_url($url);
                $href = $parts['scheme'] . '://';
                if (isset($parts['user']) && isset($parts['pass'])) {
                    $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                }
                $href .= $parts['host'];
                if (isset($parts['port'])) {
                    $href .= ':' . $parts['port'];
                }
                $href .= $path;
            }
        }

        crawl_page($href, $depth - 1);
    }

    // Emitted after the recursion, so linked pages are printed before this one.
    echo "URL:", $url, PHP_EOL, "CONTENT:", PHP_EOL, $dom->saveHTML(), PHP_EOL, PHP_EOL;
}

crawl_page("http://hobodave.com", 2);

编辑：我修复了Tatu的版本（现在使用相对URL）的一些错误。

编辑：我添加了一个新功能，防止它重复抓取同一个URL。

编辑：现在把输出回显到STDOUT，所以你可以把它重定向到任何你想要的文件。

编辑：修正了乔治在答案中指出的一个错误。相对URL将不再追加到URL路径的末尾，而是覆盖它。为此感谢乔治。请注意，乔治的答案没有考虑以下任何一项：https、用户、密码或端口。如果你已经加载了http PECL扩展，那么使用http_build_url就可以很简单地完成这个任务。否则，我必须使用parse_url手动把各部分拼接起来。再次感谢乔治。

这里我的实现基于上面的例子/答案。

这是基于类
使用Curl
支持HTTP身份验证
跳过不属于基本域的URL
返回每个页面的Http头响应代码
每页返回时间

CRAWL CLASS：

 /**
  * Depth-limited crawler built on cURL that stays on the host of the start URL.
  *
  * For every page visited it prints one line with the running page count, the
  * HTTP status code, the total transfer time, the depth and the URL.
  * Optional HTTP authentication and path exclusion filters are supported.
  */
 class crawler
 {
     protected $_url;
     protected $_depth;
     protected $_host;
     protected $_useHttpAuth = false;
     protected $_user;
     protected $_pass;
     protected $_seen = array();
     protected $_filter = array();

     /**
      * @param string $url   Start URL; its host limits which links are followed.
      * @param int    $depth Maximum number of link levels to follow.
      */
     public function __construct($url, $depth = 5)
     {
         $this->_url = $url;
         $this->_depth = $depth;
         $parse = parse_url($url);
         $this->_host = $parse['host'];
     }

     /**
      * Extract every anchor from a page and crawl the linked pages.
      *
      * @param string|false $content HTML of the page, or false/empty if the fetch failed.
      * @param string       $url     URL the content was fetched from.
      * @param int          $depth   Remaining depth of the current page.
      *
      * @return void
      */
     protected function _processAnchors($content, $url, $depth)
     {
         // A failed or empty download has no links. On PHP 8 loadHTML() throws a
         // ValueError for an empty source, which "@" cannot suppress.
         if (!is_string($content) || $content === '') {
             return;
         }

         $dom = new DOMDocument('1.0');
         // Real-world markup is rarely valid; silence the parser warnings.
         @$dom->loadHTML($content);
         $anchors = $dom->getElementsByTagName('a');

         foreach ($anchors as $element) {
             $href = trim($element->getAttribute('href'));

             // Nothing to crawl for empty links or same-page fragments.
             if ($href === '' || $href[0] === '#') {
                 continue;
             }

             if (0 !== strpos($href, 'http')) {
                 // Explicit non-HTTP schemes (mailto:, javascript:, ...) are not pages.
                 if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href)) {
                     continue;
                 }

                 // Relative link: replace the path of the current page's URL.
                 $path = '/' . ltrim($href, '/');
                 if (extension_loaded('http')) {
                     $href = http_build_url($url, array('path' => $path));
                 } else {
                     $parts = parse_url($url);
                     $href = $parts['scheme'] . '://';
                     if (isset($parts['user']) && isset($parts['pass'])) {
                         $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                     }
                     $href .= $parts['host'];
                     if (isset($parts['port'])) {
                         $href .= ':' . $parts['port'];
                     }
                     $href .= $path;
                 }
             }

             // crawl_page() itself rejects links outside the start host.
             $this->crawl_page($href, $depth - 1);
         }
     }

     /**
      * Fetch a URL with cURL.
      *
      * @param string $url URL to download.
      *
      * @return array array(response body or false, HTTP status code, total time in seconds)
      */
     protected function _getContent($url)
     {
         $handle = curl_init($url);
         if ($this->_useHttpAuth) {
             curl_setopt($handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
             curl_setopt($handle, CURLOPT_USERPWD, $this->_user . ":" . $this->_pass);
         }
         // Following 302 redirects is left disabled: it causes problems with
         // authentication.
         // curl_setopt($handle, CURLOPT_FOLLOWLOCATION, TRUE);

         // Return the content instead of printing it.
         curl_setopt($handle, CURLOPT_RETURNTRANSFER, TRUE);

         // Get the HTML or whatever is linked in $url.
         $response = curl_exec($handle);

         // Total time of the transfer.
         $time = curl_getinfo($handle, CURLINFO_TOTAL_TIME);

         // Status code, e.g. to detect a 404 (file not found).
         $httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);

         curl_close($handle);
         return array($response, $httpCode, $time);
     }

     /**
      * Print the result line for one page and push it to the client immediately.
      *
      * @param string $url      URL of the page.
      * @param int    $depth    Remaining depth when the page was fetched.
      * @param int    $httpcode HTTP status code of the response.
      * @param float  $time     Total transfer time in seconds.
      *
      * @return void
      */
     protected function _printResult($url, $depth, $httpcode, $time)
     {
         // Only flush when a buffer is active; otherwise ob_end_flush() raises a
         // notice on the first page.
         if (ob_get_level() > 0) {
             ob_end_flush();
         }
         $currentDepth = $this->_depth - $depth;
         $count = count($this->_seen);
         echo "N::$count,CODE::$httpcode,TIME::$time,DEPTH::$currentDepth URL::$url <br>";
         ob_start();
         flush();
     }

     /**
      * Decide whether a URL should be crawled.
      *
      * @param string $url   Candidate URL.
      * @param int    $depth Remaining depth for the candidate.
      *
      * @return bool False if the URL does not contain the start host, the depth
      *              is exhausted, the URL was already seen, or it contains an
      *              excluded path.
      */
     protected function isValid($url, $depth)
     {
         if (strpos($url, $this->_host) === false
             || $depth === 0
             || isset($this->_seen[$url])
         ) {
             return false;
         }
         foreach ($this->_filter as $excludePath) {
             if (strpos($url, $excludePath) !== false) {
                 return false;
             }
         }
         return true;
     }

     /**
      * Crawl one page: fetch it, report it, then follow its links.
      *
      * @param string $url   URL to crawl.
      * @param int    $depth Remaining depth.
      *
      * @return void
      */
     public function crawl_page($url, $depth)
     {
         if (!$this->isValid($url, $depth)) {
             return;
         }
         // Remember the URL so it is never fetched twice.
         $this->_seen[$url] = true;
         // Fetch the content together with status code and timing.
         list($content, $httpcode, $time) = $this->_getContent($url);
         // Report the current page.
         $this->_printResult($url, $depth, $httpcode, $time);
         // Descend into the linked pages.
         $this->_processAnchors($content, $url, $depth);
     }

     /**
      * Enable HTTP authentication for all requests.
      *
      * @param string $user User name.
      * @param string $pass Password.
      *
      * @return void
      */
     public function setHttpAuth($user, $pass)
     {
         $this->_useHttpAuth = true;
         $this->_user = $user;
         $this->_pass = $pass;
     }

     /**
      * Exclude every URL that contains the given path fragment.
      *
      * @param string $path Substring of URLs to skip.
      *
      * @return void
      */
     public function addFilterPath($path)
     {
         $this->_filter[] = $path;
     }

     /**
      * Start crawling at the URL and depth given to the constructor.
      *
      * @return void
      */
     public function run()
     {
         $this->crawl_page($this->_url, $this->_depth);
     }
 }

用法：

 // USAGE
 // Credentials for sites protected by HTTP authentication.
 $username = 'YOURUSER';
 $password = 'YOURPASS';

 // Where to start and how many link levels to follow.
 $startURL = 'http://YOUR_URL/';
 $depth = 6;

 $crawler = new crawler($startURL, $depth);
 $crawler->setHttpAuth($username, $password);

 // URLs containing this path are excluded from processing.
 $crawler->addFilterPath('customer/account/login/referer');

 $crawler->run();

看看PHP爬虫

http://sourceforge.net/projects/php-crawler/

看看是否有帮助。

为什么使用PHP，当你可以使用wget ，例如

 # Download recursively (-r), following links at most one level deep (-l 1).
 wget -r -l 1 http://www.example.com

有关如何解析内容，请参阅“解析HTML的最佳方法”，并使用搜索功能查找示例。如何解析HTML已被多次解答。

对hobodave的代码做一些小的修改，这里是一个你可以用来抓取页面的代码片段。这需要在您的服务器中启用curl扩展。

 <?php
 // Uncomment to lift PHP's execution time limit for long crawls.
 //set_time_limit (0);

 /**
  * Recursively crawl a page with cURL, following the links found in its anchors.
  *
  * Relative links are resolved against the root of the current page's host.
  * Each URL is fetched at most once per script run.
  *
  * @param string $url   URL of the page to fetch.
  * @param int    $depth Remaining number of link levels to follow; 0 stops.
  *
  * @return void
  */
 function crawl_page($url, $depth = 5)
 {
     // Must be static: a plain local array is recreated on every call and
     // therefore never remembers which pages were already visited.
     static $seen = array();

     if ($depth == 0 || isset($seen[$url])) {
         return;
     }
     $seen[$url] = true;

     $ch = curl_init();
     curl_setopt($ch, CURLOPT_URL, $url);
     curl_setopt($ch, CURLOPT_TIMEOUT, 30);
     curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
     $result = curl_exec($ch);
     curl_close($ch);

     if ($result) {
         // Keep only the anchor tags so the pattern has less markup to scan.
         $stripped_file = strip_tags($result, "<a>");
         preg_match_all(
             "/<a[\s]+[^>]*?href[\s]?=[\s\"\']+"."(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/",
             $stripped_file,
             $matches,
             PREG_SET_ORDER
         );

         foreach ($matches as $match) {
             $href = $match[1];
             if (0 !== strpos($href, 'http')) {
                 $path = '/' . ltrim($href, '/');
                 // Resolve against the page being crawled ($url). The relative
                 // link itself has no scheme or host to build from.
                 if (extension_loaded('http')) {
                     $href = http_build_url($url, array('path' => $path));
                 } else {
                     $parts = parse_url($url);
                     $href = $parts['scheme'] . '://';
                     if (isset($parts['user']) && isset($parts['pass'])) {
                         $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                     }
                     $href .= $parts['host'];
                     if (isset($parts['port'])) {
                         $href .= ':' . $parts['port'];
                     }
                     $href .= $path;
                 }
             }
             crawl_page($href, $depth - 1);
         }
     }

     // Report the page that was crawled, not the last link found on it.
     echo "Crawled {$url}";
 }

 crawl_page("http://www.sitename.com/",3);
 ?>

我已经在这篇爬虫脚本教程中对此做了讲解。

最简单的形式是：

 /**
  * Fetch a page, crawl every link found in it, then append the page's URL and
  * HTML to "results.txt" in the current working directory.
  *
  * @param string $url   URL (or path) of the page to fetch.
  * @param int    $depth Number of link levels to follow; values below 1 do nothing.
  *
  * @return void
  */
 function crawl_page($url, $depth = 5)
 {
     if ($depth <= 0) {
         return;
     }

     $html = file_get_contents($url);
     if ($html === false) {
         // The page could not be fetched: nothing to parse and nothing to save.
         return;
     }

     preg_match_all('~<a.*?href="(.*?)".*?>~', $html, $matches);
     foreach ($matches[1] as $newurl) {
         crawl_page($newurl, $depth - 1);
     }

     // Label the saved content with the URL of this page, not with the last
     // link found on it.
     file_put_contents('results.txt', $url."\n\n".$html."\n\n", FILE_APPEND);
 }

 crawl_page('http://www.domain.com/index.php', 5);

该函数将从页面获取内容，然后抓取所有找到的链接，并将内容保存到“results.txt”。该函数接受第二个参数depth，它定义了应该沿着链接抓取多少层。如果你只想解析给定页面的链接，那么传递1。

如前所述，有一些爬虫框架已经可以在那里定制，但是如果你正在做的就像你刚刚提到的那样简单，那么你可以很容易地从头开始。

抓取链接： http://www.phpro.org/examples/Get-Links-With-DOM.html

将结果转储到文件： http://www.tizag.com/phpT/filewrite.php

Hobodave，你已经非常接近了。我唯一改变的是那个if语句，它检查找到的锚标签的href属性是否以'http'开头。不能简单地在前面加上包含传入页面地址的$url变量，而必须先把它剥离到只剩主机部分，这可以用PHP的parse_url函数完成。

 <?php

 /**
  * Recursively crawl a page and the pages it links to, echoing each page's
  * URL and HTML with simple HTML line breaks.
  *
  * Relative links are resolved against the root of the current page's host,
  * keeping that page's scheme and port. Empty links, in-page fragments and
  * non-HTTP schemes (mailto:, javascript:, ...) are skipped.
  *
  * @param string $url   URL of the page to fetch.
  * @param int    $depth Remaining number of link levels to follow; 0 stops.
  *
  * @return void
  */
 function crawl_page($url, $depth = 5)
 {
     // Shared across all recursive calls so that no URL is fetched twice.
     static $seen = array();

     if (isset($seen[$url]) || $depth === 0) {
         return;
     }
     $seen[$url] = true;

     $dom = new DOMDocument('1.0');
     // Real-world markup is rarely valid; silence the parser warnings.
     @$dom->loadHTMLFile($url);

     $anchors = $dom->getElementsByTagName('a');
     foreach ($anchors as $element) {
         $href = trim($element->getAttribute('href'));

         // Nothing to crawl for empty links or same-page fragments.
         if ($href === '' || $href[0] === '#') {
             continue;
         }

         if (0 !== strpos($href, 'http')) {
             // Explicit non-HTTP schemes (mailto:, javascript:, ...) are not pages.
             if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href)) {
                 continue;
             }

             // Reduce the current URL to scheme://host[:port]. Hard-coding
             // "http://" would break https sites and non-default ports.
             $parts = parse_url($url);
             $host = (isset($parts['scheme']) ? $parts['scheme'] : 'http') . '://';
             $host .= isset($parts['host']) ? $parts['host'] : '';
             if (isset($parts['port'])) {
                 $host .= ':' . $parts['port'];
             }
             $href = $host . '/' . ltrim($href, '/');
         }

         crawl_page($href, $depth - 1);
     }

     // Emitted after the recursion, so linked pages are printed before this one.
     echo "New Page:<br /> ";
     echo "URL:", $url, PHP_EOL, "<br />", "CONTENT:", PHP_EOL, $dom->saveHTML(), PHP_EOL, PHP_EOL, " <br /><br />";
 }

 crawl_page("http://hobodave.com/", 5);
 ?>

问题是如何获得ajax调用源代码？这不是爬行，例如如何抓取像这样的链接上的图片？ http://www.tiendeo.nl/Catalogi/amsterdam/16558&subori=web_sliders&buscar=Boni&sw=1366

PHPCrawl是一个非常好的，深思熟虑的爬虫框架。

我使用@ hobodave的代码，这个小小的调整，以防止重新抓取同一个URL的所有片段变体：

 <?php function crawl_page($url, $depth = 5) { $parts = parse_url($url); if(array_key_exists('fragment', $parts)){ unset($parts['fragment']); $url = http_build_url($parts); } static $seen = array(); ...

这样你也可以省略for循环中的 $parts = parse_url($url); 这一行。

你可以试试这可能对你有帮助

 // Find the first "primary-content" block whose listing title equals
 // $search_string and print that block's HTML.

 // Title to look for, compared after collapsing runs of whitespace.
 $search_string = 'american golf News: Fowler beats stellar field in Abu Dhabi';

 // Placeholder: replace with the URL of the site to scrape.
 $siteUrl = 'http://www.example.com/';
 $html = file_get_contents($siteUrl);

 $dom = new DOMDocument;
 // Receives a copy of the matching block; stays empty when nothing matches.
 $tmpTitalDom = new DOMDocument;

 // Collect markup errors internally instead of emitting warnings.
 libxml_use_internal_errors(true);
 if ($html !== false && $html !== '') {
     // On PHP 8 loadHTML() throws a ValueError for an empty source, so a
     // failed or empty download is simply left as an empty document.
     $dom->loadHTML($html);
 }
 libxml_clear_errors();
 libxml_use_internal_errors(false);

 $xpath = new DOMXPath($dom);
 $videos = $xpath->query('//div[@class="primary-content"]');

 foreach ($videos as $video) {
     // Search for the title only inside the current block.
     $titles = $xpath->query(
         './/div[@class="listingcontainer"]/div[@class="list"]',
         $video
     );
     $titleNode = $titles->item(0);
     if ($titleNode === null) {
         // This block has no title element; nothing to compare.
         continue;
     }

     $title = trim(preg_replace('!\s+!', ' ', $titleNode->nodeValue));

     // strcmp() returns 0 when the strings are equal, so it has to be
     // compared against 0. Testing its truthiness selects non-matching titles.
     if (strcmp($title, $search_string) === 0) {
         $tmpNode = $tmpTitalDom->importNode($video, true);
         $tmpTitalDom->appendChild($tmpNode);
         break;
     }
 }

 echo $tmpTitalDom->saveHTML();

我如何在PHP中创建一个简单的爬虫？

在其他平台上使用iOS GameKit的“蓝牙Bonjour”

32位到16位浮点转换

我可以使用UriTemplate将非string传递给WCF RESTful服务吗？

ping响应“请求超时”vs“目标主机不可达”

如何在R中可视化大型网络？

如何在网站上找到sitemap.xml的路径？

在Python中打开套接字的最佳方法

TeamViewer如此之快？

在OpenCV中增加摄像头捕捉分辨率

我怎样才能使用QNetworkAccessManager POST数据到一个url