modnlp.util
public class Tokeniser extends java.lang.Object
Modifier and Type | Field and Description |
---|---|
protected java.lang.String |
encoding |
protected java.lang.Boolean |
indexPuntuation |
protected java.lang.String |
originalText |
static char[] |
SEPTKARR |
static java.lang.String |
SEPTOKEN |
protected boolean |
tagIndexing |
protected TokenMap |
tokenMap |
protected boolean |
verbose |
Constructor and Description |
---|
Tokeniser(java.io.File file,
java.lang.String e) |
Tokeniser(java.lang.String text) |
Tokeniser(java.net.URL url,
java.lang.String e) |
Modifier and Type | Method and Description |
---|---|
static java.lang.String |
disbar(java.lang.String token)
Disbar token
|
static java.lang.String |
fixType(java.lang.String type)
Delete dots (e.g.
|
java.lang.String |
getEncoding() |
java.lang.Boolean |
getIndexPuntuation()
Gets the value of indexPuntuation
|
java.lang.String |
getOriginalText() |
boolean |
getTagIndexing() |
TokenIndex |
getTokenIndex(java.lang.String str) |
TokenMap |
getTokenMap() |
boolean |
getVerbose() |
static boolean |
isBar(java.lang.String token)
Check if token is a negated token (e.g. '-c' in p(t|-c))
|
void |
setEncoding(java.lang.String v) |
void |
setIgnoredElements(java.lang.String i) |
void |
setIndexPuntuation(java.lang.Boolean argIndexPuntuation)
Sets the value of indexPuntuation
|
void |
setTagIndexing(boolean v) |
void |
setTokenMap(TokenMap t) |
void |
setVerbose(boolean v) |
java.util.List<java.lang.String> |
split(java.lang.String str) |
java.util.List<java.lang.String> |
splitWordOnly(java.lang.String str) |
void |
tokenise()
tokenise: Very basic tokenisation; serious tokenisers
must override this method. |
protected boolean tagIndexing
protected boolean verbose
protected java.lang.String originalText
protected TokenMap tokenMap
protected java.lang.String encoding
public static final char[] SEPTKARR
public static final java.lang.String SEPTOKEN
protected java.lang.Boolean indexPuntuation
public Tokeniser(java.lang.String text)
public Tokeniser(java.net.URL url, java.lang.String e) throws java.io.IOException
java.io.IOException
public Tokeniser(java.io.File file, java.lang.String e) throws java.io.IOException
java.io.IOException
public final java.lang.Boolean getIndexPuntuation()
public void setIndexPuntuation(java.lang.Boolean argIndexPuntuation)
argIndexPuntuation
- Value to assign to this.indexPuntuation
public void setTokenMap(TokenMap t)
public boolean getTagIndexing()
public void setTagIndexing(boolean v)
public boolean getVerbose()
public void setVerbose(boolean v)
public void setIgnoredElements(java.lang.String i)
public java.lang.String getEncoding()
public void setEncoding(java.lang.String v)
public TokenMap getTokenMap()
public java.lang.String getOriginalText()
public void tokenise() throws java.io.IOException
tokenise: Very basic tokenisation; serious tokenisers
must override this method. Note that positions in the tokenMap
here correspond to the ORDER in which the token appears in
originalText, not its actual OFFSET.
java.io.IOException
for a proper
implementation.
public java.util.List<java.lang.String> split(java.lang.String str)
public java.util.List<java.lang.String> splitWordOnly(java.lang.String str)
public TokenIndex getTokenIndex(java.lang.String str)
public static java.lang.String fixType(java.lang.String type)
public static boolean isBar(java.lang.String token)
public static java.lang.String disbar(java.lang.String token)