Skip to content

Commit 4de4c27

Browse files
committed
adding example code to browse pos tags in wsj data
1 parent 49c633a commit 4de4c27

File tree

1 file changed

+83
-0
lines changed

1 file changed

+83
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
package edu.umass.cs.iesl.apassos
2+
3+
import cc.factorie.app.nlp.{TokenSpan, Token, LoadOntonotes5}
4+
5+
import cc.factorie._
6+
/**
7+
* User: apassos
8+
* Date: 9/23/13
9+
* Time: 11:12 AM
10+
*/
11+
12+
/**
 * Prints, for every tag in the Penn POS domain, up to ten sampled tokens carrying
 * that tag together with the surface text of their surrounding context windows.
 */
object Lecture2POSFilters {

  /**
   * Renders a contiguous run of tokens as a single line of text.
   *
   * The phrase is taken from a `TokenSpan` that starts at the first token of `ts`
   * and covers `ts.size` tokens; newlines in the phrase are replaced by spaces.
   *
   * @param ts the tokens to render; may be empty
   * @return the phrase text, or the empty string when `ts` is empty
   */
  def tokensToPhrase(ts: Iterable[Token]): String =
    // `ts.head` throws on an empty collection, which the context windows passed
    // in by `main` can presumably be (e.g. for a token with nothing before it).
    if (ts.isEmpty) ""
    else new TokenSpan(ts.head.section, ts.head.positionInSection, ts.size).phrase.replace("\n", " ")

  /**
   * Loads the WSJ training document, shuffles its tokens and prints the samples.
   *
   * @param args ignored
   */
  def main(args: Array[String]): Unit = {
    // Fixed seed; presumably picked up implicitly by `shuffle` so runs are repeatable.
    implicit val rng = new scala.util.Random(0)
    val trainDoc = LoadOntonotes5.fromFilename("/iesl/canvas/mccallum/data/ontonotes-en-1.1.0/trn-pmd/nw-wsj-trn.dep.pmd").head
    val tokens = trainDoc.tokens.toSeq.shuffle
    for (tag <- app.nlp.pos.PennPosDomain.categories) {
      println(s"Tag: $tag")
      // Filter lazily: only the first ten matches are needed, so stop scanning once found.
      for (tok <- tokens.iterator.filter(_.posLabel.categoryValue == tag).take(10)) {
        println(f"${tokensToPhrase(tok.prevWindow(4))}%40s (${tok.string}%12s) ${tokensToPhrase(tok.nextWindow(4))}")
      }
      println()
    }
  }
}
29+
30+
/**
 * Variant of the tag browser that shows the POS tags of the neighbouring tokens
 * instead of their surface text.
 */
object Lecture2POSFiltersPOS {

  /**
   * Joins the POS tag of every token in `ts` with single spaces.
   *
   * @param ts the tokens whose tags are rendered
   * @return the space-separated tag sequence (empty string for no tokens)
   */
  def tokensToPhrase(ts: Iterable[Token]): String = {
    val tags = ts.map(token => token.posLabel.categoryValue)
    tags.mkString(" ")
  }

  /**
   * Loads the WSJ training document, shuffles its tokens and, for each tag in the
   * Penn POS domain, prints up to ten tokens with that tag and their tag context.
   *
   * @param args ignored
   */
  def main(args: Array[String]): Unit = {
    // Fixed seed; presumably picked up implicitly by `shuffle` so runs are repeatable.
    implicit val rng = new scala.util.Random(0)
    val corpus = LoadOntonotes5.fromFilename("/iesl/canvas/mccallum/data/ontonotes-en-1.1.0/trn-pmd/nw-wsj-trn.dep.pmd")
    val shuffled = corpus.head.tokens.toSeq.shuffle
    app.nlp.pos.PennPosDomain.categories.foreach { tag =>
      println(s"Tag: $tag")
      val samples = shuffled.filter(candidate => candidate.posLabel.categoryValue == tag).take(10)
      samples.foreach { tok =>
        // Evaluate left context first, then right, matching the printed order.
        val before = tokensToPhrase(tok.prevWindow(4))
        val after = tokensToPhrase(tok.nextWindow(4))
        println(f"$before%40s (${tok.string}%12s) $after")
      }
      println()
    }
  }
}
47+
48+
/**
 * Interactive lookup: reads one word from standard input and prints every
 * occurrence of it in the WSJ training document with its tag and tagged context.
 */
object Lecture2POSFiltersGrep {

  /**
   * Renders tokens as space-separated `word/TAG` pairs.
   *
   * @param ts the tokens to render
   * @return the joined `word/TAG` sequence (empty string for no tokens)
   */
  def tokensToPhrase(ts: Iterable[Token]): String = ts.map(t => s"${t.string}/${t.posLabel.categoryValue}").mkString(" ")

  /**
   * Prompts for a word and prints all case-insensitive matches.
   *
   * @param args ignored
   */
  def main(args: Array[String]): Unit = {
    // Fixed seed; presumably picked up implicitly by `shuffle` so runs are repeatable.
    implicit val rng = new scala.util.Random(0)
    val trainDoc = LoadOntonotes5.fromFilename("/iesl/canvas/mccallum/data/ontonotes-en-1.1.0/trn-pmd/nw-wsj-trn.dep.pmd").head
    val tokens = trainDoc.tokens.toSeq.shuffle
    println("Type the word")
    // Token strings are lower-cased before comparison, so the query must be too;
    // otherwise any input containing an uppercase letter can never match.
    // `readLine()` yields null at end of input, which is treated as an empty query.
    val example = Option(readLine()).map(_.trim.toLowerCase).getOrElse("")
    for (tok <- tokens.filter(_.string.toLowerCase == example)) {
      println(f"${tokensToPhrase(tok.prevWindow(4))}%60s (${tok.string}%12s)/${tok.posLabel.categoryValue} ${tokensToPhrase(tok.nextWindow(4))}")
    }
  }
}
63+
64+
65+
/**
 * For twenty sampled tokens, prints other tokens that occur between the same
 * pair of neighbouring POS tags.
 */
object Lecture2POSFiltersContext {

  /**
   * Renders tokens as space-separated `word/TAG` pairs.
   *
   * @param ts the tokens to render
   * @return the joined `word/TAG` sequence (empty string for no tokens)
   */
  def tokensToPhrase(ts: Iterable[Token]): String = ts.map(t => s"${t.string}/${t.posLabel.categoryValue}").mkString(" ")

  /**
   * Returns the POS tags of the tokens immediately before and after `t`.
   *
   * @param t the token whose neighbours are inspected
   * @return `(previous tag, next tag)`, or `("", "")` when `t` lacks either neighbour
   */
  def getContext(t: Token): (String,String) = if (t.hasPrev && t.hasNext) (t.prev.posLabel.categoryValue,t.next.posLabel.categoryValue) else ("","")

  /**
   * Loads the WSJ training document and prints, for each of twenty sampled tokens,
   * up to twenty tokens sharing its tag context.
   *
   * @param args ignored
   */
  def main(args: Array[String]): Unit = {
    // Fixed seed; presumably picked up implicitly by `shuffle` so runs are repeatable.
    implicit val rng = new scala.util.Random(0)
    val trainDoc = LoadOntonotes5.fromFilename("/iesl/canvas/mccallum/data/ontonotes-en-1.1.0/trn-pmd/nw-wsj-trn.dep.pmd").head
    val tokens = trainDoc.tokens.toSeq.shuffle
    for (t <- tokens.take(20)) {
      // Go through getContext rather than touching t.prev / t.next directly, so a
      // token without a neighbour on either side is reported with empty tags
      // instead of failing. Computing it once also keeps it out of the filter below.
      val context = getContext(t)
      val (prevTag, nextTag) = context
      println(s"Token ${t.string} with tag ${t.posLabel.categoryValue}. Context: ${prevTag} _ ${nextTag}")
      // Reshuffle per sampled token, then lazily keep the first twenty context matches.
      for (tok <- tokens.shuffle.iterator.filter(tt => getContext(tt) == context).take(20)) {
        println(f"${tokensToPhrase(tok.prevWindow(4))}%60s (${tok.string}%12s)/${tok.posLabel.categoryValue} ${tokensToPhrase(tok.nextWindow(4))}")
      }
      println()
    }
  }
}

0 commit comments

Comments
 (0)