define.py - given a carrel and a word, output definitions of the word

#!/usr/bin/env python

# define.py - given a carrel and a word, output definitions of the word

# tl;dr - given lists of sentences and a word, use the Lesk algorithm to
# first disambiguate the word, and then output the word's WordNet definition

#  sample usage: $ ./define.py homer Hector
# sample output:
#
#      synset: hector.n.01
#   frequency: 396
#  definition: (Greek mythology) a mythical Trojan who was killed by Achilles
#              during the Trojan War
#
#      synset: strong-arm.v.02
#   frequency: 56
#  definition: be bossy towards

#  sample usage: $ ./define.py frankenstein fiend
# sample output:
#
#      synset: monster.n.04
#   frequency: 36
#  definition: a cruel wicked and inhuman person
#
#      synset: fanatic.n.01
#   frequency: 4
#  definition: a person motivated by irrational enthusiasm (as for a cause);
#              --Winston Churchill
#
#      synset: devil.n.02
#   frequency: 1
#  definition: an evil supernatural being


# Eric Lease Morgan <[email protected]>
# (c) University of Notre Dame; distributed under a GNU Public License

# March   27, 2023 - first cut
# June     6, 2023 - added command line input
# November 5, 2023 - used more intelligent tokenization, and greatly refined output


# configure
# LIBRARY - the name of the setting passed to rdr's configuration(); the value
#           returned is used as the root under which the given carrel is found
# VERBOSE - when True, each matching sentence is listed beneath its synset
LIBRARY = 'localLibrary'
VERBOSE = False

# require
from   nltk.wsd import lesk
from   nltk     import word_tokenize
from   rdr      import configuration, ETC, SENTENCES, Sentences
import sys

# get input; exactly two arguments are required, the name of a carrel and the word to define
arguments = sys.argv[ 1: ]
if len( arguments ) != 2 : sys.exit( "Usage: " + sys.argv[ 0 ] + " <carrel> <word>" )
carrel, word = arguments

# initialize; build the location of the carrel's sentences
library   = configuration( LIBRARY )
sentences = library/carrel/ETC/SENTENCES

# process each sentence; create a list of ( synset, sentence ) pairs, one for
# each sentence mentioning the word
results = []
needle  = word.lower()
for line in Sentences( sentences ) :

	# normalize
	line = line.rstrip()

	# filter; this is a case-insensitive substring test, so longer words
	# containing the given word match too
	if needle not in line.lower() : continue

	# disambiguate, using the tokenized sentence as context; the magic happens here
	sense = lesk( word_tokenize( line ), word )

	# update, but only when a synset was actually identified
	if sense : results.append( ( sense, line ) )
	
# count & tabulate the results; the keys are synsets, the values are the number
# of sentences resolving to each, and keys are added in order of first appearance
synsets = {}
for synset, _ in results : synsets[ synset ] = synsets.get( synset, 0 ) + 1

# sort the synsets by frequency, most frequent first, and process each one;
# the sort is stable, so ties remain in order of first appearance
synsets = dict( sorted( synsets.items(), key=lambda item : item[ 1 ], reverse=True ) )
for synset, frequency in synsets.items() :

	# output
	# NOTE(review): the report is written to stderr, not stdout; confirm this is intended
	sys.stderr.write( '      synset: ' + synset.name()       + '\n' )
	sys.stderr.write( '   frequency: ' + str( frequency )    + '\n' )
	sys.stderr.write( '  definition: ' + synset.definition() + '\n' )

	# output some more; the matching sentences are looked up only when they are
	# going to be listed, and they are selected by equality (not identity)
	# because that is how the synsets were tallied as dictionary keys
	if VERBOSE :
		for result in results :
			if result[ 0 ] == synset : sys.stderr.write( '    sentence: ' + result[ 1 ] + '\n' )

	# delimit
	sys.stderr.write( '\n' )

# done; sys.exit is used because the bare exit() exists only when the site
# module has been loaded
sys.exit()

Creator: Eric Lease Morgan <[email protected]>
Source: This is the original publication of this item
Date created: 2023-11-06
Date updated: 2023-11-06
Subject(s): hacks;
URL: https://distantreader.org/blog/define/