
Overzealous robots.txt

Discussion in 'XenForo Questions and Support' started by cmeinck, May 4, 2013.

  1. cmeinck

    cmeinck Well-Known Member

    My robots.txt contains the following, and honestly I'm concerned that much of it may not be needed. Thoughts?

    Code:
    User-agent: Mediapartners-Google
    Disallow:
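    # (an empty Disallow lets the AdSense crawler fetch everything)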
    User-agent: *
    Disallow: /forum/members/
    Disallow: /forum/find-new/
    Disallow: /forum/account/
    Disallow: /forum/attachments/
    Disallow: /forum/goto/
    Disallow: /forum/posts/
    Disallow: /forum/login/
    Disallow: /forum/admin.php
    Disallow: /forum/search/
    Disallow: /forum/misc/style?*
    Disallow: /forum/misc/quick-navigation-menu?*
    Disallow: /forum/index.php?account/
    Disallow: /forum/index.php?find-new/
    Disallow: /forum/index.php?help/
    Disallow: /forum/index.php?goto/
    Disallow: /forum/index.php?login/
    Disallow: /forum/index.php?lost-password/
    Disallow: /forum/index.php?misc/style/
    Disallow: /forum/index.php?online/
    Disallow: /forum/index.php?posts/
    Disallow: /forum/index.php?recent-activity/
    Disallow: /forum/index.php?register/
    Disallow: /forum/index.php?search/
    Disallow: /forum/help/
    Disallow: /forum/lost-password/
    Disallow: /forum/online/
    Disallow: /forum/recent-activity/
    Disallow: /forum/register/
    Allow: /wp-content/uploads
     
     
    User-agent: Googlebot-Image
    Allow: /wp-content/uploads/
     
    User-agent: Adsbot-Google
    Allow: /
     
    User-agent: Googlebot-Mobile
    Allow: /
    
     
  2. Biker

    Biker Well-Known Member

    It's needed. By disallowing those links, you keep the Google bot from getting mad at you over tons of prohibited links it can't actually use.
     
  3. Brogan

    Brogan XenForo Moderator Staff Member

    This is the robots.txt file in use here: http://xenforo.com/robots.txt

    Code:
    User-agent: *
    Disallow: /community/find-new/
    Disallow: /community/account/
    Disallow: /community/attachments/
    Disallow: /community/goto/
    Disallow: /community/posts/
    Disallow: /community/login/
    Disallow: /community/admin.php
    Allow: /
     
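    If you want to see which of your rules actually match anything, Python's urllib.robotparser from the standard library can spot-check paths against the file. A minimal sketch — example.com and the sample paths are placeholders, not your real setup:

    Code:
    from urllib.robotparser import RobotFileParser

    BASE = "https://example.com"  # placeholder -- substitute the real domain

    rp = RobotFileParser()
    rp.set_url(BASE + "/robots.txt")
    rp.read()  # fetch and parse the live robots.txt

    # Spot-check a few paths against the "User-agent: *" rules
    for path in ("/forum/members/", "/forum/posts/", "/forum/threads/"):
        print(path, "allowed" if rp.can_fetch("*", BASE + path) else "blocked")

    Anything reported as blocked is off-limits to every compliant crawler; a rule that never matches a live URL is a candidate to drop.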
  4. ThunderBird2

    ThunderBird2 Member

    Yes, robots.txt. How about adding these rewrites to .htaccess?

    Code:
    RewriteEngine On
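    # Return 403 Forbidden (via the RewriteRule at the end of this chain)
    # to any client whose user-agent starts with one of these download tools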
    RewriteCond %{HTTP_USER_AGENT} ^BlackWidow [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Bot\ mailto:craftbot@yahoo.com [OR]
    RewriteCond %{HTTP_USER_AGENT} ^ChinaClaw [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Custo [OR]
    RewriteCond %{HTTP_USER_AGENT} ^DISCo [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Download\ Demon [OR]
    RewriteCond %{HTTP_USER_AGENT} ^eCatch [OR]
    RewriteCond %{HTTP_USER_AGENT} ^EirGrabber [OR]
    RewriteCond %{HTTP_USER_AGENT} ^EmailSiphon [OR]
    RewriteCond %{HTTP_USER_AGENT} ^EmailWolf [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Express\ WebPictures [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Navroad [OR]
    RewriteCond %{HTTP_USER_AGENT} ^NearSite [OR]
    RewriteCond %{HTTP_USER_AGENT} ^NetAnts [OR]
    RewriteCond %{HTTP_USER_AGENT} ^NetSpider [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Net\ Vampire [OR]
    RewriteCond %{HTTP_USER_AGENT} ^NetZIP [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Octopus [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Offline\ Explorer [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Offline\ Navigator [OR]
    RewriteCond %{HTTP_USER_AGENT} ^PageGrabber [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Papa\ Foto [OR]
    RewriteCond %{HTTP_USER_AGENT} ^pavuk [OR]
    RewriteCond %{HTTP_USER_AGENT} ^pcBrowser [OR]
    RewriteCond %{HTTP_USER_AGENT} ^RealDownload [OR]
    RewriteCond %{HTTP_USER_AGENT} ^ReGet [OR]
    RewriteCond %{HTTP_USER_AGENT} ^SiteSnagger [OR]
    RewriteCond %{HTTP_USER_AGENT} ^SmartDownload [OR]
    RewriteCond %{HTTP_USER_AGENT} ^SuperBot [OR]
    RewriteCond %{HTTP_USER_AGENT} ^SuperHTTP [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Surfbot [OR]
    RewriteCond %{HTTP_USER_AGENT} ^tAkeOut [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Teleport\ Pro [OR]
    RewriteCond %{HTTP_USER_AGENT} ^VoidEYE [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Web\ Image\ Collector [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Web\ Sucker [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebAuto [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebCopier [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebFetch [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebGo\ IS [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebLeacher [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebReaper [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebSauger [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Website\ eXtractor [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Website\ Quester [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebStripper [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebWhacker [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebZIP [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Wget [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Widow [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WWWOFFLE [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Xaldon\ WebSpider [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Zeus
    RewriteRule ^.* - [F,L]
       
       
    # Block Bad Bots & Scrapers
    SetEnvIfNoCase User-Agent "Aboundex" bad_bot
    SetEnvIfNoCase User-Agent "360Spider" bad_bot
    SetEnvIfNoCase User-Agent "^Java" bad_bot
    SetEnvIfNoCase User-Agent "^Cogentbot" bad_bot
    SetEnvIfNoCase User-Agent "^Alexibot" bad_bot
    SetEnvIfNoCase User-Agent "^asterias" bad_bot
    SetEnvIfNoCase User-Agent "^attach" bad_bot
    SetEnvIfNoCase User-Agent "^BackDoorBot" bad_bot
    SetEnvIfNoCase User-Agent "^BackWeb" bad_bot
    SetEnvIfNoCase User-Agent "Bandit" bad_bot
    SetEnvIfNoCase User-Agent "^BatchFTP" bad_bot
    SetEnvIfNoCase User-Agent "^Bigfoot" bad_bot
    SetEnvIfNoCase User-Agent "^Black.Hole" bad_bot
    SetEnvIfNoCase User-Agent "^BlackWidow" bad_bot
    SetEnvIfNoCase User-Agent "^BlowFish" bad_bot
    SetEnvIfNoCase User-Agent "^BotALot" bad_bot
    SetEnvIfNoCase User-Agent "Buddy" bad_bot
    SetEnvIfNoCase User-Agent "^BuiltBotTough" bad_bot
    SetEnvIfNoCase User-Agent "^Bullseye" bad_bot
    SetEnvIfNoCase User-Agent "^BunnySlippers" bad_bot
    SetEnvIfNoCase User-Agent "^Cegbfeieh" bad_bot
    SetEnvIfNoCase User-Agent "^CheeseBot" bad_bot
    SetEnvIfNoCase User-Agent "^CherryPicker" bad_bot
    SetEnvIfNoCase User-Agent "^ChinaClaw" bad_bot
    SetEnvIfNoCase User-Agent "Collector" bad_bot
    SetEnvIfNoCase User-Agent "Copier" bad_bot
    SetEnvIfNoCase User-Agent "^CopyRightCheck" bad_bot
    SetEnvIfNoCase User-Agent "^cosmos" bad_bot
    SetEnvIfNoCase User-Agent "^Crescent" bad_bot
    SetEnvIfNoCase User-Agent "^Custo" bad_bot
    SetEnvIfNoCase User-Agent "^AIBOT" bad_bot
    SetEnvIfNoCase User-Agent "^DISCo" bad_bot
    SetEnvIfNoCase User-Agent "^DIIbot" bad_bot
    SetEnvIfNoCase User-Agent "^DittoSpyder" bad_bot
    SetEnvIfNoCase User-Agent "^Download\ Demon" bad_bot
    SetEnvIfNoCase User-Agent "^Download\ Devil" bad_bot
    SetEnvIfNoCase User-Agent "^Download\ Wonder" bad_bot
    SetEnvIfNoCase User-Agent "^dragonfly" bad_bot
    SetEnvIfNoCase User-Agent "^Drip" bad_bot
    SetEnvIfNoCase User-Agent "^eCatch" bad_bot
    SetEnvIfNoCase User-Agent "^EasyDL" bad_bot
    SetEnvIfNoCase User-Agent "^ebingbong" bad_bot
    SetEnvIfNoCase User-Agent "^EirGrabber" bad_bot
    SetEnvIfNoCase User-Agent "^EmailCollector" bad_bot
    SetEnvIfNoCase User-Agent "^EmailSiphon" bad_bot
    SetEnvIfNoCase User-Agent "^EmailWolf" bad_bot
    SetEnvIfNoCase User-Agent "^EroCrawler" bad_bot
    SetEnvIfNoCase User-Agent "^Exabot" bad_bot
    SetEnvIfNoCase User-Agent "^humanlinks" bad_bot
    SetEnvIfNoCase User-Agent "^IlseBot" bad_bot
    SetEnvIfNoCase User-Agent "^Image\ Stripper" bad_bot
    SetEnvIfNoCase User-Agent "^Image\ Sucker" bad_bot
    SetEnvIfNoCase User-Agent "Indy\ Library" bad_bot
    SetEnvIfNoCase User-Agent "^InfoNaviRobot" bad_bot
    SetEnvIfNoCase User-Agent "^InfoTekies" bad_bot
    SetEnvIfNoCase User-Agent "^Intelliseek" bad_bot
    SetEnvIfNoCase User-Agent "^InterGET" bad_bot
    SetEnvIfNoCase User-Agent "^Internet\ Ninja" bad_bot
    SetEnvIfNoCase User-Agent "^Iria" bad_bot
    SetEnvIfNoCase User-Agent "^Jakarta" bad_bot
    SetEnvIfNoCase User-Agent "^JennyBot" bad_bot
     
    # Vulnerability Scanners
    SetEnvIfNoCase User-Agent "Acunetix" bad_bot
    SetEnvIfNoCase User-Agent "FHscan" bad_bot
     
    # Aggressive Chinese Search Engine
    SetEnvIfNoCase User-Agent "Baiduspider" bad_bot
     
    # Aggressive Russian Search Engine
    SetEnvIfNoCase User-Agent "Yandex" bad_bot
     
     
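    # These restrictions apply only to GET/POST/HEAD requests; with
    # "Order Allow,Deny" a matching Deny overrides "Allow from all",
    # so the listed IP ranges and anything flagged bad_bot get a 403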
    <Limit GET POST HEAD>
    Order Allow,Deny
    Allow from all
     
    # Cyveillance
    deny from 38.100.19.8/29
    deny from 38.100.21.0/24
    deny from 38.100.41.64/26
    deny from 38.105.71.0/25
    deny from 38.105.83.0/27
    deny from 38.112.21.140/30
    deny from 38.118.42.32/29
    deny from 65.213.208.128/27
    deny from 65.222.176.96/27
    deny from 65.222.185.72/29
     
    Deny from env=bad_bot
    </Limit>
       
    # IF THE UA STARTS WITH THESE
    RewriteCond %{HTTP_USER_AGENT} ^(aesop_com_spiderman|alexibot|backweb|bandit|batchftp|bigfoot) [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^(black.?hole|blackwidow|blowfish|botalot|buddy|builtbottough|bullseye) [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^(cheesebot|cherrypicker|chinaclaw|collector|copier|copyrightcheck) [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^(cosmos|crescent|curl|custo|da|diibot|disco|dittospyder|dragonfly) [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^(drip|easydl|ebingbong|ecatch|eirgrabber|emailcollector|emailsiphon) [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^(emailwolf|erocrawler|exabot|eyenetie|filehound|flashget|flunky) [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^(frontpage|getright|getweb|go.?zilla|go-ahead-got-it|gotit|grabnet) [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^(grafula|harvest|hloader|hmview|httplib|httrack|humanlinks|ilsebot) [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^(infonavirobot|infotekies|intelliseek|interget|iria|jennybot|jetcar) [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^(joc|justview|jyxobot|kenjin|keyword|larbin|leechftp|lexibot|lftp|libweb) [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^(likse|linkscan|linkwalker|lnspiderguy|lwp|magnet|mag-net|markwatch) [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^(mata.?hari|memo|microsoft.?url|midown.?tool|miixpc|mirror|missigua) [NC,OR]
     
    # STARTS WITH WEB
    RewriteCond %{HTTP_USER_AGENT} ^web(zip|emaile|enhancer|fetch|go.?is|auto|bandit|clip|copier|master|reaper|sauger|site.?quester|whack) [NC,OR]
     
    # ANYWHERE IN UA -- GREEDY REGEX
    RewriteCond %{HTTP_USER_AGENT} ^.*(craftbot|download|extract|stripper|sucker|ninja|clshttp|webspider|leacher|collector|grabber|webpictures).*$ [NC]
     
    # ISSUE 403 / SERVE ERRORDOCUMENT
    RewriteRule . - [F,L]
     
    # REDIRECT THESE CRAWLERS TO LOCALHOST
    RewriteCond %{HTTP_USER_AGENT} ^.*(Ahrefs|Baidu|BlogScope|Butterfly|DCPbot|discoverybot|domain|Ezooms|ImageSearcherFree).*$ [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^.*(ips-agent|linkdex|MJ12|Netcraft|NextGenSearchBot|SISTRIX|Sogou|soso|TweetmemeBot|Unwind|Yandex).*$ [NC]
    RewriteRule ^ http://127.0.0.1/ [R,L]
     
     
     
    RewriteEngine on
    Options +FollowSymlinks
    RewriteBase /
     
    # Replace xxx.xxx.xxx with the IP range of a bad bot...
    RewriteCond %{REMOTE_HOST} ^xxx\.xxx\.xxx\..* [OR]
    # ...or match on the bad bot's name/URL
    RewriteCond %{HTTP_USER_AGENT} badbot1 [OR]
    RewriteCond %{HTTP_USER_AGENT} badbot2
    RewriteRule ^.*$ redirect.html [L]

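    A quick way to test whether the user-agent blocks actually fire is to request a page with a spoofed UA and look for the 403. A minimal sketch in Python, with example.com standing in for the real domain:

    Code:
    import urllib.error
    import urllib.request

    # Spoof a user-agent the rules above should block (placeholder domain)
    req = urllib.request.Request(
        "https://example.com/",
        headers={"User-Agent": "WebZIP/7.0"},
    )
    try:
        urllib.request.urlopen(req)
        print("not blocked")
    except urllib.error.HTTPError as e:
        print("blocked with HTTP", e.code)  # expect 403 from the [F] rules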
    Does anyone know whether these cause problems with friendly URLs? And do I need all of this code?
     
