|
| 1 | +package edu.umass.cs.iesl.apassos |
| 2 | + |
| 3 | +import cc.factorie.app.nlp._ |
| 4 | +import cc.factorie.app.nlp.load._ |
| 5 | +import cc.factorie.app.nlp.mention.{MentionType, Mention, MentionList} |
| 6 | +import cc.factorie.app.nlp.coref.{EntityKey, ConllCorefLoader} |
| 7 | +import cc.factorie.app.nlp.hcoref.{EntityVariable, EntityRef} |
| 8 | + |
| 9 | +/** |
| 10 | + * User: apassos |
| 11 | + * Date: 10/7/13 |
| 12 | + * Time: 2:20 PM |
| 13 | + */ |
| 14 | + |
| 15 | + |
| 16 | + |
/**
 * Demo: loads one ACE APF document, converts its ACE mention spans into
 * `Mention`s stored in a `MentionList` on the document, and prints the text
 * with inline mention brackets.
 */
object Lecture5AceLoader {
  /** APF file that is read when no path is supplied on the command line. */
  private val DefaultApfPath =
    "/iesl/data/ldc/LDC2006T06/data/english/bc/fp1/cnn_cf_20030303.1900.00.apf.xml"

  /** Number of emitted characters after which the output is wrapped. */
  private val LineWidth = 70

  /** Tokens that are printed without a separating space in front of them. */
  private val NoSpaceBefore = Set(".", ",", ";", "?")

  /**
   * Prints the tokens of `aceDoc` to standard output with mention annotations.
   *
   * Every mention in the document's `MentionList` that starts at a token
   * contributes `[TYPE ` before that token, and every mention whose last token
   * is the current one contributes `](id)` after it. `id` is a small integer
   * assigned to each distinct entity string in order of first appearance.
   * A line break is emitted once at least `LineWidth` characters have been
   * written since the previous break, and the output always ends with a
   * newline when anything was printed.
   *
   * @param aceDoc document carrying a `MentionList` whose mentions have
   *               `MentionType` and `EntityRef` attributes
   */
  def printMentions(aceDoc: Document): Unit = {
    val mentionList = aceDoc.attr[MentionList]
    val entityIds = collection.mutable.HashMap[String, Int]()
    // Index mentions by (section, first token) and by (section, last token).
    val mentionsByStart = mentionList.groupBy(m => (m.section, m.start))
    val mentionsByEnd = mentionList.groupBy(m => (m.section, m.end - 1))
    var count = 0
    for (t <- aceDoc.tokens) {
      val position = (t.section, t.positionInSection)
      val piece = new StringBuilder
      for (starting <- mentionsByStart.get(position); m <- starting)
        piece ++= "[" + m.attr[MentionType].categoryValue + " "
      piece ++= t.string
      for (ending <- mentionsByEnd.get(position); m <- ending) {
        // The id is the map size at the time the entity is first seen.
        val id = entityIds.getOrElseUpdate(m.attr[EntityRef].entity.string, entityIds.size)
        piece ++= s"]($id)"
      }
      if (t.hasNext && !NoSpaceBefore.contains(t.next.string)) piece ++= " "
      count += piece.length
      print(piece.toString)
      if (count >= LineWidth) { count = 0; println() }
    }
    // Terminate the last, partially filled line.
    if (count > 0) println()
  }

  /**
   * Entry point.
   *
   * @param args optional first element: path of the APF XML file to load;
   *             defaults to `DefaultApfPath` when absent
   */
  def main(args: Array[String]): Unit = {
    val apfPath = args.headOption.getOrElse(DefaultApfPath)
    val aceDoc = LoadACE.fromApf(apfPath)
    val aceMentions = aceDoc.attr[ACEMentionSpanList]
    val mentionList = aceDoc.attr += new MentionList
    aceMentions.foreach { a =>
      // NOTE(review): the 4th constructor argument (a.length - 1) is presumably
      // the head-token offset, i.e. the span's last token -- confirm against Mention.
      val m = new Mention(a.section, a.start, a.length, a.length - 1)
      mentionList += m
      m.attr += new MentionType(m, a.attr[ACEMentionIdentifiers].mType)
      m.attr += a.attr[EntityRef]
    }
    printMentions(aceDoc)
  }
}
| 57 | + |
/**
 * Demo: loads the CoNLL 2011 (OntoNotes) coreference data and prints the first
 * document with inline mention brackets.
 */
object Lecture5OntonotesLoader {
  /** Corpus file that is read when no path is supplied on the command line. */
  private val DefaultConllPath =
    "/iesl/canvas/mccallum/data/conll2011/conll-train-clean.txt"

  /** Number of emitted characters after which the output is wrapped. */
  private val LineWidth = 70

  /** Tokens that are printed without a separating space in front of them. */
  private val NoSpaceBefore = Set(".", ",", ";", "?")

  /**
   * Extracts the part after the hyphen from an entity key name.
   *
   * @param name entity key name
   * @return the second piece when splitting `name` on "-" yields exactly two
   *         pieces, otherwise the empty string
   */
  def processEntity(name: String): String = name.split("-") match {
    case Array(_, suffix) => suffix
    case _ => ""
  }

  /**
   * Prints the tokens of `aceDoc` to standard output with mention annotations.
   *
   * Every mention in the document's `MentionList` that starts at a token
   * contributes `[` before that token, and every mention whose last token is
   * the current one contributes `]` followed by `processEntity` of its
   * `EntityKey` name. A line break is emitted once at least `LineWidth`
   * characters have been written since the previous break, and the output
   * always ends with a newline when anything was printed.
   *
   * @param aceDoc document carrying a `MentionList` whose mentions have an
   *               `EntityKey` attribute
   */
  def printMentions(aceDoc: Document): Unit = {
    val mentionList = aceDoc.attr[MentionList]
    // Index mentions by (section, first token) and by (section, last token).
    val mentionsByStart = mentionList.groupBy(m => (m.section, m.start))
    val mentionsByEnd = mentionList.groupBy(m => (m.section, m.end - 1))
    var count = 0
    for (t <- aceDoc.tokens) {
      val position = (t.section, t.positionInSection)
      val piece = new StringBuilder
      // One opening bracket per mention that begins at this token.
      for (starting <- mentionsByStart.get(position)) piece ++= "[" * starting.size
      piece ++= t.string
      for (ending <- mentionsByEnd.get(position); m <- ending)
        piece ++= "]" + processEntity(m.attr[EntityKey].name)
      if (t.hasNext && !NoSpaceBefore.contains(t.next.string)) piece ++= " "
      count += piece.length
      print(piece.toString)
      if (count >= LineWidth) { count = 0; println() }
    }
    // Terminate the last, partially filled line.
    if (count > 0) println()
  }

  /**
   * Entry point.
   *
   * @param args optional first element: path of the CoNLL coreference file to
   *             load; defaults to `DefaultConllPath` when absent
   */
  def main(args: Array[String]): Unit = {
    val ontonotesDoc = args.headOption.getOrElse(DefaultConllPath)
    val docs = ConllCorefLoader.loadWithParse(ontonotesDoc)
    // Report an empty load instead of failing with NoSuchElementException.
    docs.headOption match {
      case Some(firstDoc) => printMentions(firstDoc)
      case None => System.err.println(s"No documents loaded from $ontonotesDoc")
    }
  }
}
0 commit comments