modnlp.idx.database
public class Dictionary extends java.lang.Object
MakeTECIndex
Modifier and Type | Field and Description |
---|---|
static java.lang.String |
NOITEMS_LABEL |
static java.lang.String |
TTOKENS_LABEL |
static java.lang.String |
TTRATIO_LABEL |
protected boolean |
verbose |
Constructor and Description |
---|
Dictionary()
Open a new
Dictionary in read-only mode with default
DictProperties ("dictionary.properties" in current directory or,
failing that, hardcoded defaults). |
Dictionary(boolean write)
Open a new
Dictionary . |
Dictionary(boolean write,
DictProperties dp)
Open a new
Dictionary . |
Dictionary(DictProperties dp)
Open a new
Dictionary in read-only mode with
DictProperties dp. |
Modifier and Type | Method and Description |
---|---|
int |
addToDictionary(TokenMap tm,
java.lang.String fou)
Add each token in tm (extracted from fou) to the index
N.B.: currently, addToDictionary operations aren't atomic; if the
program crashes the index could be left in an inconsistent
state.
|
void |
cleanLog() |
void |
close() |
void |
compress() |
void |
dump() |
void |
finalize() |
java.util.Vector |
getAllFileNames(java.lang.String key)
Return a vector containing all filenames where KEY
occurs in the corpus.
|
CaseTable |
getCaseTable() |
java.lang.String |
getCorpusDir() |
DictProperties |
getDictProps() |
com.sleepycat.je.Environment |
getEnvironment() |
java.lang.String |
getExtract(java.lang.String fn,
int ctx,
long offset,
boolean ignx) |
FrequencyHash |
getFileFrequencyTable(int fno,
boolean nocase) |
int |
getFileKey(java.lang.String fou) |
FileTable |
getFileTable() |
int |
getFrequency(WordForms wforms) |
int |
getFrequency(WordForms wforms,
SubcorpusConstraints sbc) |
int[] |
getIndexedFileKeys() |
java.lang.String |
getIndexedFileName(int k) |
java.lang.String[] |
getIndexedFileNames() |
LogStream |
getLogStream() |
IntegerSet |
getOccurringFiles(java.lang.String word) |
int |
getTotalNoOfTokens() |
double |
getTypeTokenRatio() |
double |
getTypeTokenRatio(boolean nocase) |
static double |
getTypeTokenRatio(int notypes,
int notokens) |
boolean |
getVerbose() |
void |
init(boolean write) |
boolean |
isIndexed(java.lang.String fou)
Check if file or URI
fou is in the index. |
boolean |
matchConcordance(PrepContextQuery pcq,
int pos,
int[] posa)
matchConcordance matches cline against
this query (represented after parseQuery() by
queryArray and intervArray) |
void |
printConcordances(WordQuery query,
int ctx,
boolean ignx,
java.io.PrintWriter os) |
void |
printConcordances(WordQuery query,
int ctx,
boolean ignx,
java.io.PrintWriter os,
SubcorpusConstraints sbc) |
void |
printCorpusStats(java.io.PrintWriter os) |
void |
printCorpusStats(java.io.PrintWriter os,
boolean nocase) |
void |
printNoItems(java.io.PrintWriter os,
int noitems) |
void |
printSortedFreqList(java.io.PrintWriter os)
Print the entire frequency list onto os
|
void |
printSortedFreqList(java.io.PrintWriter os,
int max)
Print the max topmost frequent types onto os.
|
void |
printSortedFreqList(java.io.PrintWriter os,
int from,
int max,
boolean nocase)
Print the max topmost frequent types onto os.
|
void |
printSortedFreqList(java.io.PrintWriter os,
int from,
int max,
SubcorpusConstraints sbc,
boolean nocase)
Print the max topmost frequent types occurring in the subcorpus
denoted by sbc onto os.
|
static void |
printSubCorpusStats(java.io.PrintWriter os,
int notypes,
int notokens) |
void |
removeFromDictionary(java.lang.String fou)
removeFromDictionary de-indexes file or URL
fou
N.B.: currently, removeFromDictionary operations aren't atomic;
if the program crashes the index could be left in an inconsistent
state. |
void |
setVerbose(boolean v) |
void |
sync() |
public static final java.lang.String TTOKENS_LABEL
public static final java.lang.String TTRATIO_LABEL
public static final java.lang.String NOITEMS_LABEL
protected boolean verbose
public Dictionary()
Open a new Dictionary in read-only mode with default
DictProperties ("dictionary.properties" in current directory or,
failing that, hardcoded defaults).
public Dictionary(DictProperties dp)
Open a new Dictionary in read-only mode with
DictProperties dp.
public Dictionary(boolean write)
Open a new Dictionary.
write - a boolean value: false opens the
dictionary in read-only mode; true opens it for writing (enabling
creation of new tables). Use default DictProperties
("dictionary.properties" in current directory or, failing that,
hardcoded defaults).
public Dictionary(boolean write, DictProperties dp)
Open a new Dictionary.
write - a boolean value: false opens the
dictionary in read-only mode; true opens it for writing (enabling
creation of new tables). Use the supplied DictProperties dp.
public void init(boolean write)
public int addToDictionary(TokenMap tm, java.lang.String fou) throws AlreadyIndexedException, EmptyFileException
tm - a TokenMap: multiset of tokens
fou - a String: the file whose TokenMap is tm
AlreadyIndexedException - if an error occurs
EmptyFileException
public void removeFromDictionary(java.lang.String fou) throws NotIndexedException
removeFromDictionary de-indexes file or URL fou.
N.B.: currently, removeFromDictionary operations aren't atomic;
if the program crashes the index could be left in an inconsistent
state. In future, implement it using JE transactions.
fou - a String value
NotIndexedException - if an error occurs
public boolean isIndexed(java.lang.String fou)
Check if file or URI fou is in the index.
fou - a String value
Returns: a boolean value
public int getFileKey(java.lang.String fou)
public FileTable getFileTable()
public java.lang.String[] getIndexedFileNames()
public int[] getIndexedFileKeys()
public java.lang.String getIndexedFileName(int k)
public FrequencyHash getFileFrequencyTable(int fno, boolean nocase)
public com.sleepycat.je.Environment getEnvironment()
public int getFrequency(WordForms wforms)
public int getFrequency(WordForms wforms, SubcorpusConstraints sbc)
public CaseTable getCaseTable()
public boolean matchConcordance(PrepContextQuery pcq, int pos, int[] posa)
matchConcordance matches cline against
this query (represented after parseQuery() by
queryArray and intervArray).
N.B.: this should use binary search in order to speed it up to logarithmic levels (avoiding the current O(n) worst-case behaviour).
pcq - the 'pre-processed' query object, containing the
search horizons (Horizon objects), and a set of byte offsets per
wordform for a specific file (WordPositionTable).
pos - the position (as byte offset) of the keyword on the file
posa - the entire word position array for a specified file
(obtained through TPosTable.getPosArray(int)).
Returns: true if cline matches, false otherwise.
public java.util.Vector getAllFileNames(java.lang.String key)
key - the keyword to search for
public void printCorpusStats(java.io.PrintWriter os)
public void printCorpusStats(java.io.PrintWriter os, boolean nocase)
public static final void printSubCorpusStats(java.io.PrintWriter os, int notypes, int notokens)
public void printNoItems(java.io.PrintWriter os, int noitems)
public int getTotalNoOfTokens()
public double getTypeTokenRatio(boolean nocase)
public double getTypeTokenRatio()
public static final double getTypeTokenRatio(int notypes, int notokens)
public void printSortedFreqList(java.io.PrintWriter os)
os - a PrintWriter value
public void printSortedFreqList(java.io.PrintWriter os, int max)
os - a PrintWriter value
max - an int value
public void printSortedFreqList(java.io.PrintWriter os, int from, int max, boolean nocase)
os - a PrintWriter value
max - an int value
public void printSortedFreqList(java.io.PrintWriter os, int from, int max, SubcorpusConstraints sbc, boolean nocase)
os - a PrintWriter value
max - an int value
sbc - a SubcorpusConstraints value
public java.lang.String getCorpusDir()
public void printConcordances(WordQuery query, int ctx, boolean ignx, java.io.PrintWriter os)
public void printConcordances(WordQuery query, int ctx, boolean ignx, java.io.PrintWriter os, SubcorpusConstraints sbc)
public java.lang.String getExtract(java.lang.String fn, int ctx, long offset, boolean ignx)
public DictProperties getDictProps()
public IntegerSet getOccurringFiles(java.lang.String word)
word - a String: the query word (type)
Returns: an IntegerSet, the set of keys to files in
which word occurs
public void dump()
public void sync()
public void compress()
public void cleanLog()
public void close()
public void finalize()
Overrides: finalize in class java.lang.Object
public LogStream getLogStream()
public boolean getVerbose()
public void setVerbose(boolean v)