Technology•May 29, 2014
Powers of Ten – Part I
# Fetch the gzip-compressed wiki-Vote file (-L follows redirects, -O saves it under the remote file name).
$ curl -L -O http://snap.stanford.edu/data/wiki-Vote.txt.gz
# Decompress the archive, which leaves wiki-Vote.txt for the load script to read.
$ gunzip wiki-Vote.txt.gz
// Open the Titan graph kept under /tmp/1m and define the schema used by the loader below.
g = TitanFactory.open('/tmp/1m')
// 'userId' is a String property key, indexed for vertices and declared unique,
// so a vertex can be looked up by its userId value.
g.makeKey('userId').dataType(String.class).indexed(Vertex.class).unique().make()
// Edge label attached between the two vertices of every loaded row.
g.makeLabel('votesFor').make()
// Commit the type definitions before any data is written.
g.commit()
// Return the vertex whose 'userId' property equals the given id, adding a new
// vertex with that userId when the lookup finds nothing.
//   id - the userId value to look up (a String field read from the input file)
// Returns the existing or newly created vertex.
getOrCreate = { id ->
  def p = g.V('userId', id)
  // Plain ternary expression: its value is the closure's return value.
  // (Prefixing it with `if (...)` is a syntax error in Groovy.)
  p.hasNext() ? p.next() : g.addVertex([userId:id])
}
// Read wiki-Vote.txt line by line and add one 'votesFor' edge per data row.
new File('wiki-Vote.txt').eachLine { row ->
  // Rows starting with '#' carry no edge; returning from the closure moves on to the next line.
  if (row.startsWith("#")) return
  // Resolve the first two tab-separated fields to vertices (looked up or created by getOrCreate).
  (fromVertex, toVertex) = row.split('\t').collect(getOrCreate)
  fromVertex.addEdge('votesFor', toVertex)
}
// Persist everything that was added while reading the file.
g.commit()
$ bin/gremlin.sh
\,,,/
(o o)
-----oOOo-(_)-oOOo-----
gremlin> \. load-1m.groovy
==>titangraph[local:/tmp/1m]
==>userId
...
==>null
gremlin> g.V.count()
==>7115
gremlin> g.E.count()
==>103689
# Download the referral archive (-L follows redirects, -O saves it under the remote file name).
$ curl -L -O http://downloads.cms.gov/foia/physician-referrals-2012-2013-days30.zip
# Extract the archive; the zip is removed only when unzip succeeds (&&).
$ unzip physician-referrals-2012-2013-days30.zip && rm physician-referrals-2012-2013-days30.zip
# Print the first three lines of the text file to inspect its layout.
$ head -n3 Physician-Referrals-2012-2013-DAYS30.txt
# Write a line-sorted copy of the data; the load script reads this sorted file.
$ sort Physician-Referrals-2012-2013-DAYS30.txt > Physician-Referrals-2012-2013-DAYS30-sorted.txt
// Assemble the storage settings one property at a time, then open the graph with them.
conf = new BaseConfiguration()
conf.setProperty("storage.backend", "berkeleyje")
conf.setProperty("storage.directory", "/tmp/10m")
conf.setProperty("storage.batch-loading", true)
g = TitanFactory.open(conf)
// Schema for the referral graph: one vertex key plus three edge property keys.
// 'npi' is a String key declared single-valued, unique and indexed for vertices.
g.makeKey("npi").dataType(String.class).single().unique().indexed(Vertex.class).make()
// Integer property keys; the returned key objects are kept so they can be passed to signature() below.
sharedTransactionCount = g.makeKey("sharedTxCount").dataType(Integer.class).make()
patientTotal = g.makeKey("patientTotal").dataType(Integer.class).make()
sameDayTotal = g.makeKey("sameDayTotal").dataType(Integer.class).make()
// 'shares' edge label, declared with the three keys above as its signature.
g.makeLabel("shares").signature(sharedTransactionCount, patientTotal, sameDayTotal).make()
// Commit the type definitions before loading data.
g.commit()
// Wrap the graph in a BatchGraph that addresses vertices by String id and
// stores that id in the 'npi' property.
bg = new BatchGraph(g, VertexIDType.STRING, 10000)
bg.setVertexIdKey("npi")
// Running count of edges written, used only for progress output.
c = 0L
new File("Physician-Referrals-2012-2013-DAYS30-sorted.txt").eachLine({ final String line ->
  // Each row holds five comma-separated fields: two vertex ids followed by three counters.
  def columns = line.split(',')*.trim()
  def (sourceId, targetId, txCount, patients, sameDay) = columns
  // Reuse a vertex already known to the batch graph, otherwise add it.
  def source = bg.getVertex(sourceId) ?: bg.addVertex(sourceId)
  def target = bg.getVertex(targetId) ?: bg.addVertex(targetId)
  def shares = bg.addEdge(null, source, target, "shares")
  // Copy the three counters onto the edge as Integers, in the same order as the columns.
  [sharedTxCount: txCount, patientTotal: patients, sameDayTotal: sameDay].each { key, raw ->
    shares.setProperty(key, raw as Integer)
  }
  // Report progress every 100000 edges.
  if (++c%100000L == 0L) println "Processed ${c} edges"
})
// Flush whatever is still buffered.
bg.commit()