Shall I compare thee to a summer's... ?
Shall I compare thee to a summer's ... ?- first hit
Romeo, Romeo ...- tough with only the start
Hedgehog- really, did you expect it to work?
$ grep summer\'s.day shaks12.txt
Shall I compare thee to a summer's day?
as you shall see in a summer's day. But it is very well; what he
desire in a summer's day. Here is his Majesty.
Like stinging bees in hottest summer's day,
$ grep -A14 "Shall I compare thee to a summer's day?" shaks12.txt Shall I compare thee to a summer's day? Thou art more lovely and more temperate: Rough winds do shake the darling buds of May, And summer's lease hath all too short a date: [...]
:top
/^[0-9]\+\r$/ b title
/^ACT/ { x; s/ACT.*$//; s/.$//; G; x }
/hedgehog/ { x; p; x; p; i\
}
d # goes to top
:title
n; /^.$/ n; x; d; b top;
$ sed -n -f find.sed shaks12.txt
A MIDSUMMER NIGHT'S DREAM
ACT II. SCENE I.
Thorny hedgehogs, be not seen;
[...]
Romeo, Romeo ...again
Cousins, that strive by factions and by friends ambitiously for rule and empery ...
$ grep cousins.*factions shaks12.txt
$ grep cousins.*friends shaks12.txt
$ grep cousins.*empery shaks12.txt
$ grep friends.*empery shaks12.txt
$ grep factions.*friends shaks12.txt MARCUS. Princes, that strive by factions and by friends
class Handler(xml.sax.handler.ContentHandler):
def __init__(self, filename):
self.play = ''
self.act = ''
self.scene = ''
self.state = '' # we'll do this using strings
self.doc = xapian.Document()
self.position = 1
self.filename = filename
self.stemmer = xapian.Stem("en")
self.act_count = 0
self.scene_count = 0
class Handler(xml.sax.handler.ContentHandler):
def startElement(self, name, attrs):
if name=='PLAY':
self.state='play'
elif name=='PERSONAE':
self.state='personae'
elif name in ('INDUCT', 'ACT', 'PROLOGUE', 'EPILOGUE'):
self.act_count += 1
self.state='act'
elif name=='SCENE':
self.scene_count += 1
self.state='scene'
elif name=='TITLE':
if self.state in ('play', 'personae', 'act', 'scene'):
self.state="%s-title" % self.state
elif name=='STAGEDIR':
self.state='stagedir'
elif name=='SPEAKER':
self.state='speaker'
elif name=='LINE':
self.state='line'
class Handler(xml.sax.handler.ContentHandler):
def characters(self, content):
if self.state=='play-title':
self.play += ' ' + content
elif self.state=='personae-title':
pass
elif self.state=='act-title':
self.act += ' ' + content
elif self.state=='scene-title':
self.scene += ' ' + content
elif self.state=='stagedir' or self.state=='line':
self.index_text(content)
elif self.state=='speaker':
self.index_text(content)
self.index_text(content, 'XS')
class Handler(xml.sax.handler.ContentHandler):
def endElement(self, name):
if name=='LINE':
self.state='scene'
elif name=='SPEAKER':
self.state='scene'
elif name=='STAGEDIR':
self.state='scene'
elif name=='TITLE':
if self.state[-6:]=='-title':
self.state=self.state[:-6]
elif name=='SCENE':
unique_term = self.add_stuff() # index titles, create doc data
get_db().replace_document(unique_term, self.doc)
self.doc = xapian.Document()
self.scene = ''
elif name=='ACT':
self.act = ''
elif name=='PLAY':
self.play = ''
class Handler(xml.sax.handler.ContentHandler):
def add_stuff(self):
self.index_text(self.play, 'S', 10)
self.index_text(self.act, 'S', 10)
self.index_text(self.scene, 'S', 10)
self.index_text(self.scene, '', 10)
data = "play=%s\nact=%s\nscene=%s\nfilename=%s" % (safe(self.play),
safe(self.act), safe(self.scene), self.filename)
self.doc.set_data(data)
unique_term = "Q-%s-%i-%i" % (self.filename, self.act_count, self.scene_count)
self.doc.add_term(unique_term)
return unique_term
# get_db() is a singleton WritableDatabase constructor:
# xapian.WritableDatabase('shakespeare', xapian.DB_CREATE_OR_OPEN)
def safe(s): # Get rid of characters we can't carry in document data
return str(s).replace("\n", " ")
def p_plusminus(c):
return c=='+' or c=='-'
class Handler(xml.sax.handler.ContentHandler):
def index_text(self, text, prefix='', position_inc=0, wdfinc=1):
...
This is pretty big.
If you're really interested, ask me afterwards.
if __name__ == "__main__":
import sys
for fname in sys.argv[1:]:
index_file(fname)
Let's go!
def auto_enquote(x):
if x.find(" ")==-1:
return x
else:
return '"' + x + '"'
if __name__ == "__main__":
import sys
query = " ".join(map(auto_enquote, sys.argv[1:]))
qp = xapian.QueryParser()
qp.add_prefix("title", "S")
qp.add_prefix("speaker", "XS")
qp.set_stemmer(xapian.Stem("en"))
qp.set_stemming_strategy(xapian.QueryParser.STEM_ALL)
qp.set_default_op(xapian.Query.OP_OR)
db = xapian.Database("shakespeare")
qp.set_database(db)
q = qp.parse_query(query)
enq = xapian.Enquire(db)
enq.set_query(q)
mset = enq.get_mset(0, 10)
for match in mset:
data = match[4].get_data()
def r(x, y):
x[y.split('=')[0]]=y.split('=')[1]
return x
f = reduce(r, data.split('\n'), {})
print "%2.2i (%3.3i%%) %s, %s, %s" % (match[2]+1,
match[3],
f['play'],
f['act'],
f['scene'])
$ ./search.py cousins that strive by factions and by friends 01 (080%) The Life of Timon of Athens, ACT III, SCENE V. The same. The senate-house. The Senate sitting. 02 (067%) The Tragedy of Titus Andronicus, ACT I, SCENE I. Rome. Before the Capitol. [...]
class Handler(xml.sax.handler.ContentHandler):
def __init__(self, filename):
[...]
self.speech_count = 0
self.speech = ''
def add_stuff(self):
[...]
data = "play=%s\nact=%s\nscene=%s\nspeech=%s\nfilename=%s" % (safe(self.play),
safe(self.act),
safe(self.scene),
safe(self.speech),
self.filename)
unique_term = "Q-%s-%i-%i-%i" % (self.filename, self.act_count,
self.scene_count, self.speech_count)
def startElement(self, name, attrs):
[...]
elif name=='STAGEDIR' or name=='SPEECH':
if self.state!='speech':
self.speech_count += 1
self.state='speech'
class Handler(xml.sax.handler.ContentHandler):
def endElement(self, name):
if name=='LINE':
self.state='speech'
elif name=='SPEAKER':
self.state='speech'
elif name=='STAGEDIR':
if self.state=='speech':
self.add_doc()
self.speech = ''
self.state = 'scene'
else:
self.state='speech'
elif name=='SPEECH':
self.add_doc()
self.speech = ''
self.state = 'scene'
elif name=='SCENE':
self.scene = ''
[...]
class Handler(xml.sax.handler.ContentHandler):
def characters(self, content):
[...]
elif self.state=='stagedir' or self.state=='line':
self.index_text(content)
self.speech += "\n" + content
elif self.state=='speaker':
self.index_text(content)
self.index_text(content, 'XS')
self.speech += "\n" + content
def add_doc(self):
unique_term = self.add_stuff()
get_db().replace_document(unique_term, self.doc)
self.doc = xapian.Document()
import textwrap
print "%2.2i (%3.3i%%) %s, %s, %s\n%s" % (match[2]+1,
match[3],
f['play'],
f['act'],
f['scene'],
"\n".join(textwrap.wrap(f['speech'])))
$ ./search.py cousins that strive by factions and by friends 01 (083%) The Tragedy of Titus Andronicus, ACT I, SCENE I. Rome. Before the Capitol. MARCUS ANDRONICUS Princes, that strive by factions and by friends Ambitiously for rule and empery, Know that the people of Rome, for whom we stand A special party, have, by common voice, In election for the Roman empery, Chosen Andronicus, surnamed Pius For many good and great deserts to Rome: A nobler man, a braver warrior, Lives not this day within the city walls: He by the senate is accit'd home From weary wars against the barbarous Goths; That, with his sons, a terror to our foes, Hath yoked a nation strong, train'd up in arms. Ten years are spent since first he undertook This cause of Rome and chastised with arms Our enemies' pride: five times he hath return'd Bleeding to Rome, bearing his valiant sons In coffins from the field; And now at last, laden with horror's spoils, Returns the good Andronicus to Rome, Renowned Titus, flourishing in arms. Let us entreat, by honour of his name, Whom worthily you would have now succeed. And in the Capitol and senate's right, Whom you pretend to honour and adore, That you withdraw you and abate your strength; Dismiss your followers and, as suitors should, Plead your deserts in peace and humbleness. [...]