<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: web crawling for google only in General Topics</title>
    <link>https://live.paloaltonetworks.com/t5/general-topics/web-crawling-for-google-only/m-p/24759#M18044</link>
    <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;P&gt;There is no easy way to have the googlebot identified in a rule so that it would be the only one allowed to crawl.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Google does provide a way to verify the bot via reverse dns lookup of the bot ip address.&amp;nbsp; Here you check the source of the crawler and lookup in DNS to a specific google subdomain as outlined in this document.&amp;nbsp; The issue is there is no way to have this kind of check in a PA rule at this point.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;A href="https://support.google.com/webmasters/answer/80553?hl=en" title="https://support.google.com/webmasters/answer/80553?hl=en"&gt;Verifying Googlebot - Webmaster Tools Help&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Google bot also uses a specified user agent string.&amp;nbsp; So you could create a custom vulnerability signature to look for the string.&amp;nbsp; There are two issues with this method.&amp;nbsp; One it is a good way to block but not a way to permit only the hits.&amp;nbsp; And two there are people faking these user agents since they know they are trusted and would pass the test even without being the real deal.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;A href="https://support.google.com/webmasters/answer/1061943" title="https://support.google.com/webmasters/answer/1061943"&gt;Google crawlers - Webmaster Tools Help&lt;/A&gt;&lt;/P&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
    <pubDate>Thu, 26 Jun 2014 01:08:31 GMT</pubDate>
    <dc:creator>pulukas</dc:creator>
    <dc:date>2014-06-26T01:08:31Z</dc:date>
    <item>
      <title>web crawling for google only</title>
      <link>https://live.paloaltonetworks.com/t5/general-topics/web-crawling-for-google-only/m-p/24758#M18043</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;P&gt;I know this topic has been discussed before but there is never a clear answer. It seems it is not possible to allow only specific web crawlers such as google. If that's the case, I assume most of you have web-crawling enabled for your site only? Google is still getting blocked from crawling our site. I was hesitant to enable web-crawling but it sounds like that's the only way it's going to work.&lt;/P&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Mon, 23 Jun 2014 23:18:10 GMT</pubDate>
      <guid>https://live.paloaltonetworks.com/t5/general-topics/web-crawling-for-google-only/m-p/24758#M18043</guid>
      <dc:creator>bino150</dc:creator>
      <dc:date>2014-06-23T23:18:10Z</dc:date>
    </item>
    <item>
      <title>Re: web crawling for google only</title>
      <link>https://live.paloaltonetworks.com/t5/general-topics/web-crawling-for-google-only/m-p/24759#M18044</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;P&gt;There is no easy way to have the googlebot identified in a rule so that it would be the only one allowed to crawl.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Google does provide a way to verify the bot via reverse dns lookup of the bot ip address.&amp;nbsp; Here you check the source of the crawler and lookup in DNS to a specific google subdomain as outlined in this document.&amp;nbsp; The issue is there is no way to have this kind of check in a PA rule at this point.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;A href="https://support.google.com/webmasters/answer/80553?hl=en" title="https://support.google.com/webmasters/answer/80553?hl=en"&gt;Verifying Googlebot - Webmaster Tools Help&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Google bot also uses a specified user agent string.&amp;nbsp; So you could create a custom vulnerability signature to look for the string.&amp;nbsp; There are two issues with this method.&amp;nbsp; One it is a good way to block but not a way to permit only the hits.&amp;nbsp; And two there are people faking these user agents since they know they are trusted and would pass the test even without being the real deal.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;A href="https://support.google.com/webmasters/answer/1061943" title="https://support.google.com/webmasters/answer/1061943"&gt;Google crawlers - Webmaster Tools Help&lt;/A&gt;&lt;/P&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Thu, 26 Jun 2014 01:08:31 GMT</pubDate>
      <guid>https://live.paloaltonetworks.com/t5/general-topics/web-crawling-for-google-only/m-p/24759#M18044</guid>
      <dc:creator>pulukas</dc:creator>
      <dc:date>2014-06-26T01:08:31Z</dc:date>
    </item>
    <item>
      <title>Re: web crawling for google only</title>
      <link>https://live.paloaltonetworks.com/t5/general-topics/web-crawling-for-google-only/m-p/24760#M18045</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;P&gt;Thank you.&lt;/P&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Wed, 09 Jul 2014 04:50:53 GMT</pubDate>
      <guid>https://live.paloaltonetworks.com/t5/general-topics/web-crawling-for-google-only/m-p/24760#M18045</guid>
      <dc:creator>bino150</dc:creator>
      <dc:date>2014-07-09T04:50:53Z</dc:date>
    </item>
  </channel>
</rss>

