|
| 1 | +package edu.umass.cs.iesl.apassos |
| 2 | + |
| 3 | +import cc.factorie.app.nlp.{TokenSpan, Token, LoadOntonotes5} |
| 4 | + |
| 5 | +import cc.factorie._ |
| 6 | +/** |
| 7 | + * User: apassos |
| 8 | + * Date: 9/23/13 |
| 9 | + * Time: 11:12 AM |
| 10 | + */ |
| 11 | + |
/**
 * Lecture demo: for every category in `PennPosDomain`, prints up to ten
 * randomly ordered tokens carrying that tag, each shown with the text of the
 * four-token windows before and after it.
 *
 * Usage: an optional first command-line argument overrides the corpus file.
 */
object Lecture2POSFilters {

  /** Corpus file loaded when no path is given on the command line. */
  private val DefaultCorpus = "/iesl/canvas/mccallum/data/ontonotes-en-1.1.0/trn-pmd/nw-wsj-trn.dep.pmd"

  /**
   * Renders a run of tokens as the phrase of a `TokenSpan` that starts at the
   * first token and covers `ts.size` tokens, with newlines replaced by spaces.
   *
   * @param ts tokens to render; presumably contiguous within one section, as
   *           produced by `prevWindow` / `nextWindow` (confirm against callers)
   * @return the span's phrase, or the empty string when `ts` is empty
   */
  def tokensToPhrase(ts: Iterable[Token]): String =
    // An empty window has no head to anchor a span on; calling `ts.head`
    // there would throw, so render it as an empty string instead.
    ts.headOption.fold("") { first =>
      new TokenSpan(first.section, first.positionInSection, ts.size).phrase.replace("\n", " ")
    }

  /**
   * Loads the first document of the corpus, shuffles its tokens with a fixed
   * seed and prints ten examples per POS tag.
   *
   * @param args optional: `args(0)` is the corpus file to load instead of the default
   */
  def main(args: Array[String]): Unit = {
    // Fixed seed so that the shuffled order (and therefore the output) is reproducible.
    implicit val rng: scala.util.Random = new scala.util.Random(0)
    val corpus = args.headOption.getOrElse(DefaultCorpus)
    val trainDoc = LoadOntonotes5.fromFilename(corpus).headOption
      .getOrElse(sys.error(s"No documents could be loaded from $corpus"))
    val tokens = trainDoc.tokens.toSeq.shuffle
    for (tag <- app.nlp.pos.PennPosDomain.categories) {
      println(s"Tag: $tag")
      // Filter lazily: scanning stops as soon as ten matching tokens were found.
      for (tok <- tokens.iterator.filter(_.posLabel.categoryValue == tag).take(10)) {
        println(f"${tokensToPhrase(tok.prevWindow(4))}%40s (${tok.string}%12s) ${tokensToPhrase(tok.nextWindow(4))}")
      }
      println()
    }
  }
}
| 29 | + |
/**
 * Lecture demo: like the word-based filter, but the context windows around
 * each example token are shown as sequences of POS tags instead of words.
 *
 * Usage: an optional first command-line argument overrides the corpus file.
 */
object Lecture2POSFiltersPOS {

  /** Corpus file loaded when no path is given on the command line. */
  private val DefaultCorpus = "/iesl/canvas/mccallum/data/ontonotes-en-1.1.0/trn-pmd/nw-wsj-trn.dep.pmd"

  /**
   * Renders tokens as their POS tags separated by single spaces.
   *
   * @param ts tokens to render
   * @return the space-joined tag values; empty string for an empty input
   */
  def tokensToPhrase(ts: Iterable[Token]): String =
    ts.map(_.posLabel.categoryValue).mkString(" ")

  /**
   * Loads the first document of the corpus, shuffles its tokens with a fixed
   * seed and prints, for every category in `PennPosDomain`, up to ten tokens
   * with that tag together with the tags of the four-token windows around them.
   *
   * @param args optional: `args(0)` is the corpus file to load instead of the default
   */
  def main(args: Array[String]): Unit = {
    // Fixed seed so that the shuffled order (and therefore the output) is reproducible.
    implicit val rng: scala.util.Random = new scala.util.Random(0)
    val corpus = args.headOption.getOrElse(DefaultCorpus)
    val trainDoc = LoadOntonotes5.fromFilename(corpus).headOption
      .getOrElse(sys.error(s"No documents could be loaded from $corpus"))
    val tokens = trainDoc.tokens.toSeq.shuffle
    for (tag <- app.nlp.pos.PennPosDomain.categories) {
      println(s"Tag: $tag")
      // Filter lazily: scanning stops as soon as ten matching tokens were found.
      for (tok <- tokens.iterator.filter(_.posLabel.categoryValue == tag).take(10)) {
        println(f"${tokensToPhrase(tok.prevWindow(4))}%40s (${tok.string}%12s) ${tokensToPhrase(tok.nextWindow(4))}")
      }
      println()
    }
  }
}
| 47 | + |
/**
 * Lecture demo: reads a word from standard input and prints every occurrence
 * of it in the corpus (case-insensitively), with its POS tag and the tagged
 * four-token windows before and after it.
 *
 * Usage: an optional first command-line argument overrides the corpus file.
 */
object Lecture2POSFiltersGrep {

  /** Corpus file loaded when no path is given on the command line. */
  private val DefaultCorpus = "/iesl/canvas/mccallum/data/ontonotes-en-1.1.0/trn-pmd/nw-wsj-trn.dep.pmd"

  /**
   * Renders tokens as space-separated `word/TAG` pairs.
   *
   * @param ts tokens to render
   * @return the joined `word/TAG` pairs; empty string for an empty input
   */
  def tokensToPhrase(ts: Iterable[Token]): String =
    ts.map(t => s"${t.string}/${t.posLabel.categoryValue}").mkString(" ")

  /**
   * Loads the first document of the corpus, prompts for a word and prints all
   * tokens whose lower-cased string equals the lower-cased, trimmed input.
   *
   * @param args optional: `args(0)` is the corpus file to load instead of the default
   */
  def main(args: Array[String]): Unit = {
    // Fixed seed so that the shuffled order (and therefore the output) is reproducible.
    implicit val rng: scala.util.Random = new scala.util.Random(0)
    val corpus = args.headOption.getOrElse(DefaultCorpus)
    val trainDoc = LoadOntonotes5.fromFilename(corpus).headOption
      .getOrElse(sys.error(s"No documents could be loaded from $corpus"))
    val tokens = trainDoc.tokens.toSeq.shuffle
    println("Type the word")
    // The reader returns null at end of input; Option(...) maps that to None.
    // The query is lower-cased because the token side of the comparison is
    // lower-cased too: without this, input such as "The" could never match.
    val query = Option(Console.in.readLine()).map(_.trim.toLowerCase)
    query.foreach { example =>
      for (tok <- tokens.filter(_.string.toLowerCase == example)) {
        println(f"${tokensToPhrase(tok.prevWindow(4))}%60s (${tok.string}%12s)/${tok.posLabel.categoryValue} ${tokensToPhrase(tok.nextWindow(4))}")
      }
    }
  }
}
| 63 | + |
| 64 | + |
/**
 * Lecture demo: picks twenty random tokens and, for each, prints up to twenty
 * other tokens that occur in the same POS context, i.e. whose previous and
 * next tokens carry the same pair of tags.
 *
 * Usage: an optional first command-line argument overrides the corpus file.
 */
object Lecture2POSFiltersContext {

  /** Corpus file loaded when no path is given on the command line. */
  private val DefaultCorpus = "/iesl/canvas/mccallum/data/ontonotes-en-1.1.0/trn-pmd/nw-wsj-trn.dep.pmd"

  /**
   * Renders tokens as space-separated `word/TAG` pairs.
   *
   * @param ts tokens to render
   * @return the joined `word/TAG` pairs; empty string for an empty input
   */
  def tokensToPhrase(ts: Iterable[Token]): String =
    ts.map(t => s"${t.string}/${t.posLabel.categoryValue}").mkString(" ")

  /**
   * POS context of a token.
   *
   * @param t the token to inspect
   * @return `(previous tag, next tag)` when the token has both neighbours,
   *         otherwise `("", "")`
   */
  def getContext(t: Token): (String, String) =
    if (t.hasPrev && t.hasNext) (t.prev.posLabel.categoryValue, t.next.posLabel.categoryValue)
    else ("", "")

  /**
   * Loads the first document of the corpus, shuffles its tokens with a fixed
   * seed and prints context-matched examples for the first twenty of them.
   *
   * @param args optional: `args(0)` is the corpus file to load instead of the default
   */
  def main(args: Array[String]): Unit = {
    // Fixed seed so that the shuffled order (and therefore the output) is reproducible.
    implicit val rng: scala.util.Random = new scala.util.Random(0)
    val corpus = args.headOption.getOrElse(DefaultCorpus)
    val trainDoc = LoadOntonotes5.fromFilename(corpus).headOption
      .getOrElse(sys.error(s"No documents could be loaded from $corpus"))
    val tokens = trainDoc.tokens.toSeq.shuffle
    for (t <- tokens.take(20)) {
      // Compute the context once through getContext, which checks
      // hasPrev/hasNext; reading t.prev / t.next directly here is unguarded
      // for a token that lacks a neighbour.
      val context = getContext(t)
      val (prevTag, nextTag) = context
      println(s"Token ${t.string} with tag ${t.posLabel.categoryValue}. Context: $prevTag _ $nextTag")
      // Reshuffle per query token so each example list is an independent sample.
      for (tok <- tokens.shuffle.filter(tt => getContext(tt) == context).take(20)) {
        println(f"${tokensToPhrase(tok.prevWindow(4))}%60s (${tok.string}%12s)/${tok.posLabel.categoryValue} ${tokensToPhrase(tok.nextWindow(4))}")
      }
      println()
    }
  }
}
0 commit comments