Есть такой проект: http://sqid.rubyforge.org/ Скрипт ищет SQL-инъекции по запросу в Google, начиная от SyBase и заканчивая MySQL. В последнее время у меня что-то ни фига не пашет. Не может найти ни одной ссылки. Проверил, может, Google что-то поменял — нет, всё так же... вроде... Не могу понять, в чём проблема в функции поиска. Хотел бы уточнить, что скрипт работает через Code: google.com/xhtml/search?mrestrict=xhtml&site=search&q={query}&start={result_start}&sa=N Code: def search puts "[+] Getting %d links from search %s starting from %d." % [ @nos, @query, @start ] http_obj = SqidHTTP.new("http://www.google.com",@http_opts) upto_in_10s(@nos).each do |maxResults| search_str="/xhtml/search?mrestrict=xhtml&site=search&q=" + URI.escape(@query) + "&start=" + String(@start) + "&sa=N" http_obj.path = search_str http_res = http_obj.get return if http_res == nil http_res.body.gsub!(/[\r][\n]?/,"") http_res.body.scan(/href\s*=\s*\"*[^\">]*/) { |t| @urls.push(t.split("&u=")[1]) } @start+=maxResults end @urls.compact!() @urls.collect! {|url| URI.unescape(url)} puts "[+] Done got %d links." % @urls.size end def do check end end class SqidPAGE < SqidURL def initialize(http_opts,sqid_opts) @http_opts = http_opts @page = sqid_opts['page'] super(http_opts,sqid_opts) end def do puts "[+] Getting links from page %s.\n" % @page get_links(@page) { |link| @urls.push(link) } puts "[+] Done got %d links." % @urls.size super end def get_links(url) return if url == nil a = SqidHTTP.new(url,@http_opts) r = a.get page ="" page = r.body if r yield_url = URI.parse(url) base_path = URI.parse(url).path base_path = base_path[1...base_path.length] if base_path.length >= 1 page.scan(/href\s*=\s*["']([^"']+)["']/i) { |link| link=URI.escape(link[0]) begin u = URI.parse("#{link}") if u.scheme next if !( u.scheme =~ /http[s]?:\/\//i ) end if u.relative? next if u.path == "/" next if u.path.rindex("..") if u.path[0] == "/" yield_url.path = u.path yield yield_url.to_s end t = base_path.split("/") if hasext? 
base_path if t.length > 1 t = t[0...t.length-1] end end add_path=t.join("/") yield_url.path = "/" + add_path + "/" + u.path yield yield_url.to_s if not skipurl?(yield_url.to_s) else yield link if not skipurl?(link) end rescue puts "[*] Invalid URL: " + $!.message next end } end end Скачать можно тут hxxp://rubyforge.org/frs/?group_id=2617
Дебаг-режим включается тут: #@@HTTP.set_debug_output $stderr Необходимо заменить регулярки: Code: http_res.body.gsub!(/[\r][\n]?/,"") http_res.body.scan(/href\s*=\s*\"*[^\">]*/) { |t| Ниже пример исходного кода страницы. Помогите составить регулярки, я в кодинге вообще ни в зуб ногой... Code: <?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE html PUBLIC "-//WAPFORUM//DTD XHTML Mobile 1.0//EN" "http://www.wapforum.org/DTD/xhtml-mobile10.dtd"> <html xmlns="http://www.w3.org/1999/xhtml"> <head> <meta http-equiv="Content-Type" content="application/xhtml+xml; charset=UTF-8"/> <meta http-equiv="Cache-Control" content="no-cache"/> <title> inurl:id= news filetype:asp - Google Search </title> <style type="text/css">.n{text-align:center;padding-top:12px;font-size:x-small;white-space:nowrap}.j{font-size:x-small;text-align:center}.i{margin:2px;padding:2px;text-align:center;font-size:x-small}.i div{margin:4px auto}.c{margin:4px auto;font-size:x-small}.g{background-color:#ebeff9;border-top:1px solid #36c;margin:2px 0 4px 0;padding:2px 2px 4px 2px}.d{padding-right:15px}.l{padding-right:15px;font-weight:bold}.h{font-family:Arial,Helvetica,sans-serif;font-size:small}.k{text-align:center;padding:4px;background-color:#eff3fa;border-top:1px solid #36c;border-bottom:1px solid #36c}.m{margin:4px 0 2px;text-align:center}.m a{padding:0 5px}a{color:#20c}.e{vertical-align:middle;border:0}#query_box_bottom_form{text-align:center;padding:4px;background-color:#eff3fa}.b{padding:5px 0 5px 1px}.f{margin-bottom:0;padding-bottom:0}.a{color:#008000} </style> </head> <body class="h"> <div> <div> <div> <form class="f" id="query_box_top_form" action="/xhtml/search"> <div><input type="hidden" name="site" value="search"/><input type="hidden" name="mrestrict" value="xhtml"/><img class="e e" src="http://www.gstatic.com/m/images/logo_small.gif" width="48" height="17" alt="Google"/> <br/> <input id="query_box_top_textbox" type="text" name="q" size="16" value="inurl:id= news filetype:asp"/> <input id="query_box_top_submit" 
type="submit" value="Search"/> </div> </form> </div> </div> <div class="g" id="navbar"> <div><span class="l">Web</span><a class="d" href="/m/search?mrestrict=xhtml&q=inurl%3Aid%3D+news+filetype%3Aasp&site=images&tab=wi&sa=N" >Images</a><a class="d" href="/m/local?mrestrict=xhtml&q=inurl%3Aid%3D+news+filetype%3Aasp&site=local&tab=wl&sa=N" >Local</a><a class="d" href="/m/news?mrestrict=xhtml&q=inurl%3Aid%3D+news+filetype%3Aasp&tab=wn&sa=N" >News</a> </div> </div> <div id="universal"> <div> <div class="b"><a href="http://www.hsri.org/index.asp?id=news" >Latest HSRI <b>News</b> and Publications</a> <br/><span id="snip_0">Jul 29, 2009 .. 5/2009 <b>News</b>: An article featuring NCI data was recently published in the April 2009 issue of the ..</span><span class="a"> <br/>www.hsri.org/index.asp</span> <br/> </div> </div> <div> <div class="b"><a href="http://royalsociety.org/news.asp?id=8734" >Stop emitting CO2 or geoengineering could be our only hope</a> <br/><span id="snip_1">Aug 28, 2009 .. Science in the <b>News</b> · Education in the <b>News</b> · Press releases · The Royal Society press office ..</span><span class="a"> <br/>royalsociety.org/news.asp</span> <br/> </div> </div> <div> <div class="b"><a href="http://www.isuppli.com/news/default.asp?id=6919" >iSuppli Corporation : Applied Market Intelligence</a> <br/><span id="snip_2">Nov 16, 2006 .. Now for the good <b>news</b>: iSuppli Corp.'s dissection reveals the PlayStation 3 is an engineering ..</span><span class="a"> <br/>www.isuppli.com/news/default.asp</span> <br/> </div> </div> <div> <div class="b"><a href="http://www.pnl.gov/news/release.asp?id=383" >PNNL: <b>News</b> - New geothermal heat extraction process to deliver <b>...</b></a> <br/><span id="snip_3">Jul 15, 2009 .. Search PNNL. PNNL Home · About · Research · Publications · Jobs · <b>News</b> · Contacts. 
PNNL <b>News</b> Central ..</span><span class="a"> <br/>www.pnl.gov/news/release.asp</span> <br/> </div> </div> <div> <div class="b"><a href="http://www.amadirectlink.com/news/story.asp?id=595" >AMA letter to CPSC regarding OHV lead content</a> <br/><span id="snip_4">Feb 4, 2009 .. The Life; <b>News</b> & Events · Rights .. URL for this article: www.amadirectlink.com /<b>news</b>/story.asp ..</span><span class="a"> <br/>www.amadirectlink.com/news/story.asp</span> <br/> </div> </div> <div> <div class="b"><a href="http://www.globalhealthreporting.org/news.asp?id=826" ><b>News</b> Summaries - Kaiser Global Health</a> <br/><span id="snip_5">The Kaiser Family Foundation presents Kaiser Global Health - an interactive resource tracking global health <b>news</b> and ..</span><span class="a"> <br/>www.globalhealthreporting.org/news.asp</span> <br/> </div> </div> <div> <div class="b"><a href="http://www.af.mil/news/story.asp?id=123012131" >Air Force testing new transparent armor</a> <br/><span id="snip_6">Oct 17, 2005 .. Engineers here are testing a new kind of transparent armor -- stronger and lighter than traditional ..</span><span class="a"> <br/>www.af.mil/news/story.asp</span> <br/> </div> </div> <div> <div class="b"><a href="http://www.trb.org/news/blurb_detail.asp?id=8794" >The Potential Impacts of Climate Change on U.S. Transportation <b>...</b></a> <br/><span id="snip_7">TRB Special Report 290: The Potential Impacts of Climate Change on US Transportation explores the consequences of ..</span><span class="a"> <br/>www.trb.org/news/blurb_detail.asp</span> <br/> </div> </div> <div> <div class="b"><a href="http://www.sacurrent.com/news/story.asp?id=69607" >SA Current - <b>NEWS</b>+FEATURES: The panopticon economy</a> <br/><span id="snip_8">The latest <b>news</b> in Uncle Sam's ongoing surveillance scandal happens to come from the FBI's involvement with the NSA. 
..</span><span class="a"> <br/>www.sacurrent.com/news/story.asp</span> <br/> </div> </div> <div> <div class="b"><a href="http://www.olapwork.com/news/press_release.asp?id=20070924_006494" >SAP - Business Objects Turns Text Into Insight</a> <br/><span id="snip_9">Delivers Only Solution Integrating Text Analysis and Search With End-to-End Business Intelligence. SAN JOSE, Calif. ..</span><span class="a"> <br/>www.olapwork.com/../press_release.asp</span> <br/> </div> </div> </div> <div class="m"><a href="/xhtml/search?site=search&mrestrict=xhtml&q=inurl%3Aid%3D+news+filetype%3Aasp&start=10&sa=N" >Next page »</a> </div> </div> <div> <div class="k"> <form class="f" id="query_box_bottom_form" action="/xhtml/search"> <div><input type="hidden" name="mrestrict" value="xhtml"/><input type="hidden" name="sa" value="2"/> <input id="query_box_bottom_textbox" type="text" name="q" size="18" value="inurl:id= news filetype:asp"/> <select id="query_box_bottom_dropdown" name="site"> <option value="universal" selected="selected"> Web </option> <option value="images"> Images </option> <option value="news"> News </option> <option value="local"> Local </option> <option value="products"> Products </option> <option value="blogs"> Blogs </option> <option value="mobile"> Mobile </option> </select> <input id="query_box_bottom_submit" type="submit" value="Search"/> </div> </form> </div> <div class="n" id="user_info"><a href="https://www.google.com/accounts/ServiceLogin?service=mobile&passive=true&cd=US&hl=en&continue=http%3A%2F%2Fwww.google.com%2Fxhtml%2Fsearch%3Fsite%3Dsearch%26mrestrict%3Dxhtml%26q%3Dinurl%253Aid%253D%2Bnews%2Bfiletype%253Aasp&ltmpl=m&btmpl=mobile" >Sign in</a> </div> <div class="i"> <div class="c"><a href="/m?mrestrict=xhtml&sa=N" >Google Home</a> - <a href="/m/ig?mrestrict=xhtml&sa=N" >iGoogle</a> </div> <div class="c"><a href="/xhtml/search?site=search&mrestrict=xhtml&q=inurl%3Aid%3D+news+filetype%3Aasp&action=sets" >Settings</a> - <a 
href="/m/survey/feedback?mrestrict=xhtml&q=inurl%3Aid%3D+news+filetype%3Aasp" >Feedback</a> - <a href="/m/help?mrestrict=xhtml&sa=N" >Help</a> </div> <div class="c">View Google in: <br/>Mobile | <a href="/search?q=inurl%3Aid%3D+news+filetype%3Aasp" >Classic</a> </div> </div> <div class="j">©2009 - <a href="http://m.google.com/static/en/privacy.html" >Privacy</a> </div> </div> <div><img src="/m/ping?ust=1256018683599&page=srp" width="1" height="1" alt=""/> </div> </body> </html>
Какую версию используешь? Только что скачал версию 0.3, проверил: в Google ходит, ссылки парсит, даже что-то искать пытается. P.S. Очередной кавычкоподставлятельский выводоошибкоожидательский скрипт.