Multithreaded simple URL Crawler

Here is a quick program that creates a pool of X threads and uses it to crawl a URL once for each id in a set of results returned by a database query.


#!/usr/bin/env groovy
import groovy.sql.Sql
//
// MultiThreaded query script.
// Runs a query and then submits one job per result row to a thread pool created with Executors.newFixedThreadPool
//
// 3/2008 by George kowalski
//

// How many threads are allowed to run at one time.
def MAX_THREADS = 10

println "Processing started .. Quering Database for RGDIds ... "

def sql = Sql.newInstance("jdbc:oracle:thin:@site.edu:1521:SCHEMA", "USERID", "passwd", "oracle.jdbc.driver.OracleDriver");
def service = java.util.concurrent.Executors.newFixedThreadPool(MAX_THREADS)

// Ids collected from the query; one crawl task is later submitted per entry.
def rgdIDList = []

try {
    // Only rgd_id is read from each row, so select just that column instead of
    // every column of both joined tables.
    sql.eachRow("select genes.rgd_id from genes, rgd_ids where genes.rgd_id = rgd_ids.rgd_id and rgd_ids.object_status = 'ACTIVE' and rgd_ids.species_type_key = 3", { rgdIDList << it.rgd_id })
} finally {
    // The connection is not used after the query; release it even if the query fails.
    sql.close()
}

// size() is called explicitly: property-style access on a List is treated by
// Groovy as a lookup on each element, not as the element count.
println "Done with Query we will be processing ${rgdIDList.size()} ids "

/**
 * Task executed by the thread pool: requests the genes_view.cgi URL for one id.
 * The response body is read and discarded; only progress messages are printed.
 */
class toRun implements Runnable {
    /** Value substituted into the "id" query parameter of the requested URL. */
    String id

    /**
     * @param newid id to request when this task runs
     */
    toRun(String newid) {
        this.id = newid
    }

    /**
     * Fetches the URL for this task's id. An I/O failure is reported on stderr
     * together with the id, so a failed request can be traced to its input
     * instead of surfacing as an anonymous uncaught exception in a pool thread.
     */
    public void run() {
        println "Calling URL with id: ${id}"
        try {
            // The body is not needed; the call is made only to hit the URL.
            new URL("http://rgddev.mcw.edu/tools/genes/genes_view.cgi?id=${id}").getText()
            println "Return from: ${id}"
        } catch (IOException e) {
            System.err.println "Failed for id: ${id}: ${e}"
        }
    }
}

// Queue one task per id; the fixed-size pool limits how many run at once.
for (id in rgdIDList) {
    println "Submitting Thread for id: ${id}"
    service.execute( new toRun(id.toString()) )
    // This is just to slow down display on console, not needed.
    Thread.sleep(500)
}

// Stop accepting new tasks and let the already-queued ones finish. Without
// this the pool's worker threads stay alive and the script never exits.
service.shutdown()

Advertisements