// NOTE(review): this file is a git patch whose newlines were stripped; the
// payload below has been reconstructed into readable form. Non-Scala hunks
// (.gitignore, README) are kept as comments so no information is lost.
//
// ==== .gitignore (new file) ====
//   logs
//   project/project
//   project/target
//   target
//   tmp
//   .history
//   dist
//
// ==== README (new file) ====
//   This is your new Play 2.0 application
//   =====================================
//   This file will be packaged with your application, when using `play dist`.
//
// ==== app/controllers/Application.scala (new file) ====
package controllers

import play.api._
import play.api.mvc._
import java.io.File
import paper.Analyze
import generators._

/**
 * HTTP entry points of the Play 2.0 application: schedule-PDF generation,
 * per-task text extraction, personal-graph pages and (stubbed) link creation.
 */
object Application extends Controller {

  /** Landing page; renders the index view with an empty link. */
  def index = Action {
    Ok(views.html.index(""))
  }

  /**
   * Builds a schedule PDF from the form-encoded POST body.
   * Expects `papers[]` (selected paper ids) and an optional `abstract` flag.
   */
  def generateSchedulePdf = Action { request =>
    request.body.asFormUrlEncoded match {
      case Some(form) =>
        // NOTE(review): form("papers[]") throws NoSuchElementException when
        // the key is absent — confirm the client always submits it.
        val papers      = form("papers[]").toList
        val abstractGet = form("abstract")

        // The abstract flag is only honoured when exactly one value was sent.
        val abstractVal = if (abstractGet.length == 1) abstractGet.toList.head.toInt else 0

        Ok(SchedulePdfGenerator.apply(papers, abstractVal))
      case None =>
        // Body was not form-url-encoded.
        Ok("Something went wrong")
    }
  }

  /** Runs a named task (e.g. "abstract") against a URL-encoded resource path. */
  def generateTaskProcess(task: String, path: String) = Action {
    Ok(TaskProcessGenerator.apply(task, utf8URLDecode(path)))
  }

  /** Renders the index view for a personal graph link. */
  def generatePersonalGraph(link: String) = Action {
    Ok(views.html.index(link))
  }

  /** Stub: will create a personal link for the submitted e-mail address. */
  def generateLink = Action { request =>
    val body = request.body.asFormUrlEncoded
    if (body != None) {
      // Extracted but not used yet — kept for the upcoming implementation.
      val email = body.get("useremail").toList
    }
    // TODO: actually generate and persist the link.
    Redirect(routes.Application.index)
  }

  /** Stub: will serve the JSON data backing a personal graph. */
  def getGraphData(link: String) = Action {
    Ok("") // TODO
  }

  /**
   * URL-decodes `url` as UTF-8 and rewrites `%uXXXX` escapes into HTML
   * numeric character references.
   *
   * BUG FIX: the pattern kept PHP preg delimiters ("/.../i") which Java regex
   * treats as literal slashes, and the replacement used "\1" which Matcher
   * escapes to a literal "1". Use an inline (?i) flag and "$1" instead.
   */
  private def utf8URLDecode(url: String): String =
    """(?i)%u([0-9a-f]{3,4})""".r.replaceAllIn(
      java.net.URLDecoder.decode(url.replace("\\00", "%u00"), "UTF-8"),
      "&#x$1;")
}
// ==== rename: lib/jquery/jquery.js~ -> app/generators/LinkGenerator.scala ====
// NOTE(review): the patch renames an editor *backup* of jquery.js to
// LinkGenerator.scala — almost certainly an accidental commit; flag it.
//
// ==== app/generators/SchedulePdfGenerator.scala (new file) ====
package generators

import play.api._
import play.api.mvc._
import play.i18n.Lang
import play.api.libs.json._
import scala.io.Source
import java.io.File
import java.security.MessageDigest

/**
 * One paper of the conference schedule. Immutable; the set* methods return
 * updated copies (kept alongside the case-class `copy` for API stability).
 */
case class Paper(id: String, title: String, authors: String, pdf: String, date: String, room: String) {
  def getId: String      = this.id
  def getTitle: String   = this.title
  def getAuthors: String = this.authors
  def getPdf: String     = this.pdf
  def getDate: String    = this.date
  def getRoom: String    = this.room

  def setId(newId: String): Paper           = Paper(newId, title, authors, pdf, date, room)
  def setTitle(newTitle: String): Paper     = Paper(id, newTitle, authors, pdf, date, room)
  def setAuthors(newAuthors: String): Paper = Paper(id, title, newAuthors, pdf, date, room)
  def setPdf(newPdf: String): Paper         = Paper(id, title, authors, newPdf, date, room)
  def setDate(newDate: String): Paper       = Paper(id, title, authors, pdf, newDate, room)
  def setRoom(newRoom: String): Paper       = Paper(id, title, authors, pdf, date, newRoom)
}

/** Builds a [[Paper]] from one node of the data.json graph. */
object PaperCreator {
  def createPaper(json: JsValue): Paper = {
    // drop(1).dropRight(1) strips the quotes JsValue.toString puts around
    // JSON strings; the id is kept verbatim (it is a JSON number).
    Paper(
      (json \ "id").toString,
      (json \ "title").toString.drop(1).dropRight(1),
      (json \ "authors").toString.drop(1).dropRight(1),
      (json \ "pdf").toString.drop(1).dropRight(1),
      (json \ "date").toString.drop(1).dropRight(1),
      (json \ "room").toString.drop(1).dropRight(1))
  }
}

/** Generates the LaTeX sources for a printable schedule (port of a PHP script). */
object SchedulePdfGenerator {

  /**
   * Generates the LaTeX content for the selected papers in a fresh temporary
   * directory and returns the (currently empty) page body.
   * `abstractVal == 1` includes paper abstracts in the output.
   */
  def apply(selectedPapers: List[String], abstractVal: Int): String = {
    val json: JsValue = Json.parse(Source.fromFile("public/js/data.json").mkString)
    val nodes = (json \\ "nodes").head

    // Dividing to get seconds in place of milliseconds
    // date_default_timezone_set("UTC");   (left over from the PHP original)
    val papers = changeDate(getPapers(nodes, selectedPapers))

    // Temporary working directory with a pseudo-random name.
    // BUG FIX: Math.random.toInt is always 0 (random < 1), so every request
    // used the same directory; scale before truncating.
    val postfix   = md5((Math.random * Int.MaxValue).toInt.toString).substring(0, 7)
    val directory = "public/tex/temp/" + postfix
    new File(directory).mkdir

    val tempContent = createSchedule(abstractVal, papers, directory)
    val tempOutput  = directory + "/isit_schedule"

    // Shell command that would produce the PDF (not executed yet).
    val call = "cat public/tex/header.tex " + tempContent +
      " public/tex/footer.tex | pdflatex --jobname " + tempOutput

    /*
    // Make system call
    $ret = exec($call);

    // Create temporary page "your download should appear in a moment. If not click here"
    echo page_output($temp_output);

    // Redirect to schedule
    header( 'Location: '.$temp_output.'.pdf' ) ;
    */ ""
  }

  /**
   * Hex-encoded MD5 of `s`.
   * BUG FIX: Array[Byte].toString yielded "[B@…" (object identity), not the
   * digest; encode the bytes as lowercase hex instead.
   */
  private def md5(s: String): String = {
    val algorithm = MessageDigest.getInstance("MD5")
    algorithm.update(s.getBytes)
    algorithm.digest.map("%02x".format(_)).mkString
  }

  /** Drops the last three characters of every date (milliseconds -> seconds). */
  private def changeDate(papers: List[Paper]): List[Paper] = {
    def changeDate0(papers: List[Paper], accu: List[Paper]): List[Paper] = papers match {
      case List()  => accu
      case x :: xs => changeDate0(xs, x.setDate(x.getDate.dropRight(3)) :: accu)
    }
    changeDate0(papers, List()).reverse
  }

  /**
   * Creates the content part of the schedule file and returns its path.
   */
  private def createSchedule(abstractVal: Int, papers: List[Paper], directory: String): String = {
    // Sorts papers by date
    // uasort($papers, "sort_by_date");   (PHP original — not ported yet)

    var schedule = ""
    var date     = ""
    var time     = ""
    var first    = true

    // Now for each paper, write it out
    papers.foreach(p => {
      // Check if the date is the same as before
      val newDate = "" // date('l \t\h\e jS \of F Y', p.getDate)  — not ported yet
      val newTime = "" // date('h:i A', p.getDate)                — not ported yet

      // Start a new day section whenever the date changes.
      if (!newDate.equals(date)) {
        if (!first) schedule = schedule + endList
        schedule = schedule + addScheduleDay(newDate) + startList
        first = false
      }

      // Only print the time when it differs from the previous entry.
      schedule =
        if (newTime.equals(time)) schedule + addSchedulePoint(abstractVal, p, "")
        else schedule + addSchedulePoint(abstractVal, p, newTime)

      // Remember date/time for the next iteration.
      date = newDate
      time = newTime
    })

    // Save schedule to file
    val file    = directory + "/content.tex"
    val fileObj = new File(file)
    fileObj.createNewFile
    val p = new java.io.PrintWriter(fileObj)
    p.print(schedule)
    p.close
    file
  }

  /**
   * Constructs the LaTeX code for a single paper in the schedule.
   */
  private def addSchedulePoint(abstractVal: Int, p: Paper, time: String): String = {
    // Optional abstract (small font) fetched through the task processor.
    var abstr = ""
    if (abstractVal == 1) {
      abstr = TaskProcessGenerator.apply("abstract", p.getPdf)
      abstr = latexSpecialChars(abstr)
      abstr = "{\\small " + abstr + "} \n"
    }

    // Get title
    val title = "{\\it " + latexSpecialChars(p.getTitle /*, "\\'\"&\n\r{}[]"*/) + "} \\\\ \n"

    // Get time and place.
    // BUG FIX: "\bf" had a single backslash, i.e. a backspace control char
    // followed by 'f'; LaTeX needs the literal \bf.
    val point =
      if (time.equals("")) "\\item[{\\hfill " + p.getRoom + "}]\n"
      else "\\item[{\\hfill \\bf " + time + "} \\\\ {\\hfill " + p.getRoom + "}]\n"

    // Get authors
    val authors = "{" + p.getAuthors + "} \\\\ \n"

    point + title + authors + abstr + "\n"
  }

  /** Opens the LaTeX enumeration used for one day. */
  private def startList: String =
    "%\n%\n\\begin{enumerate}[leftmargin=5cm, labelsep=0.3cm, rightmargin=2cm, align=right, itemsep=1cm, style=multiline]\n%\n"

  /** Closes the LaTeX enumeration of a day. */
  private def endList: String =
    "\\end{enumerate}\n%\n"

  /**
   * Constructs the LaTeX code for a page dedicated to a certain day.
   * BUG FIX: the method used procedure syntax (no '='), so it returned Unit
   * and the caller concatenated "()" into the schedule.
   */
  private def addScheduleDay(date: String): String = {
    "\\clearpage\\period{" + date + "}\\hfil\\break \\\\ \n"
  }

  /**
   * Sorts two items based on their date (ascending; ties compare as greater).
   */
  private def sortByDate(a: Paper, b: Paper): Int =
    if (a.getDate < b.getDate) -1 else 1

  /**
   * Displays a field
for downloading the schedule + */ + private def pageOutput(tempOutput: String): String = { + "TrailHead

If your download didn't start automatically, click here

" + } + + private def latexSpecialChars( str: String ): String = { + val map = Map( + "#"->"\\#", + "$"->"\\$", + "%"->"\\%", + "&"->"\\&", + "~"->"\\~{}", + "_"->"\\_", + "^"->"\\^{}", + "\\"->"\\textbackslash", + "{"->"\\{", + "}"->"\\}" + ) + return """([\^\%~\\\\#\$%&_\{\}])""".r.replaceAllIn(str, m => map(m.group(1))) + } + + private def getPapers(array: JsValue, selectedPapers: List[String]): List[Paper] = { + var output: List[Paper] = List[Paper]() + + selectedPapers.foreach((id: String) => { + val paper = PaperCreator.createPaper(array.apply(id.toInt)) + output = paper.setId(id) :: output + }) + + output.reverse + } +} \ No newline at end of file diff --git a/app/generators/TaskProcessGenerator.scala b/app/generators/TaskProcessGenerator.scala new file mode 100644 index 0000000..1ff4ef2 --- /dev/null +++ b/app/generators/TaskProcessGenerator.scala @@ -0,0 +1,46 @@ +package generators + +import play.api._ +import play.api.mvc._ +import play.i18n.Lang +import scala.io.Source +import java.io.File +import paper._ + +object TaskProcessGenerator { + def apply(task: String, path: String): String = { + return if(task.equals("abstract")) getAbstract(path) else "Task not recognized" + } + + private def getAbstract(pdf: String): String = { + val array = List[(String, String)](("û", "fi"), (""d", "≤"), ("ï¬", "fi"), ("≤", "≤"), ("º", "κ"), + ("´", "δ"), (""¥", "⊥"), (""H", "≈"), (""", "♣"), ("³", "γ"), ("Á", "ρ"), + ("Ä", "τ"), ("»", "λ")) + + + val files = Analyze.main(Array[String]("public/" + pdf, "-p")) + + if(files.isEmpty) return "Can't load the abstract" + else return replaceWithList(files.head.getAbstract.getText, array) + } + + private def replaceWithList(str: String, array: List[(String, String)]): String = { + var output = str + + array.foreach((l: (String, String)) => output = output.replace(l._1, l._2)) + + output + } + + private def repeatStr(str: String, nb: Int): String = { + var output = "" + var i = 0 + + while(i < nb) { + output = output + str + i = 
i + 1 + } + + output + } +} \ No newline at end of file diff --git a/app/paper/Analyze.scala b/app/paper/Analyze.scala new file mode 100644 index 0000000..2f63c8c --- /dev/null +++ b/app/paper/Analyze.scala @@ -0,0 +1,64 @@ +package paper + +object Analyze { + def main(args : Array[String]): List[Paper] = { + // create analyzer + val A : Analyzer = new Analyzer() + + // Check that a directory is supplied (there is an argument) + if (args.length == 0 || args.length > 2) {println("You should provide at leath a path and at most a path and an option. Type -h for help.");List()} + + else if(args.contains("-h")) { println("How to call: Analyze [path] [parameter]?\nPARAMETERS:\n\t-p : parsing\n\t-s : looks for xml scheduler\n\t-c : compare\n\t-e : extend\n\t-g : create graph\n\t-h : shows this help page\n\tnothing : do everything"); List()} + // Then go ahead + else A.analyze(args(0), args.toList.tail) + } +} + +class Analyzer extends Object with LoadPaper + with ParsePaper + with ExtendPaper + with ComparePaper + with XMLScheduleParser + with Graphs { + + // Set a limit in percent for when papers get an edge between them + val limit : Int = 1 + + // Get cached papers in this order + val cache : List[String] = List(Cache.linked, Cache.extended, Cache.scheduled, Cache.parsed) + + // Set sources we want to extend with + //val sources : List[PaperSource] = List(TalkDates, TalkRooms, PdfLink) + val sources : List[PaperSource] = List(PdfLink) + + // Analyze a paper + def analyze(paperPos: String, options: List[String]): List[Paper] = { + // Get a list of parsed papers + val papers : Option[List[Paper]] = if(options.isEmpty || options.contains("-p")) Some(loadAndParse(paperPos, cache, XMLParser, XMLConverterLoader)) else None + + // Mix in the schedule XML data + val xmlPapers : Option[List[Paper]] = if(options.isEmpty || options.contains("-s")) Some(getXMLSchedule(paperPos, papers)) else None + + // Extend papers with tertiary data + val extendedPapers : Option[List[Paper]] = 
if(options.isEmpty || options.contains("-e")) Some(extend(paperPos, xmlPapers, sources)) else None + + // Compare the papers individually + val comparedPapers : Option[List[Paper]] = if(options.isEmpty || options.contains("-c")) Some(compare(paperPos, extendedPapers, limit)) else None + + + if(options.isEmpty || options.contains("-g")){ + // Create graph + val graph : Graph = getGraph(paperPos, comparedPapers) + + // Print graph to file 'data.json' + graph.save + } + + if(options.contains("-p")) return papers.get + else if(options.contains("-s")) return xmlPapers.get + else if(options.contains("-e")) return extendedPapers.get + else if(options.isEmpty || options.contains("-c")) return comparedPapers.get + + List() + } +} diff --git a/app/paper/ComparePaper.scala b/app/paper/ComparePaper.scala new file mode 100644 index 0000000..c822db4 --- /dev/null +++ b/app/paper/ComparePaper.scala @@ -0,0 +1,51 @@ +package paper +import java.io._ +import scala.io.Source + +trait ComparePaper { + + def compare(paperPos:String, papers : Option[List[Paper]], limit : Int) : List[Paper] = { + println("BEGIN OF PAPERS COMPARISION") + val loadedPapers = if(papers == None) CacheLoader.load(paperPos, Cache.extended) else papers.get + + val finalPapers = loadedPapers.map(p => { + // Check that paper isn't already linked + if (p.meta.get("linked") == None) { + // Get list of papers that aren't current paper + val otherPapers = loadedPapers.filter(p != _) + + // Compare to every other paper + val weights : List[Int] = for (other <- otherPapers) yield getWeight(p, other) + + // Make links + //val links = for ((p,w) <- otherPapers.zip(weights) if w >= limit) yield Link(p.id,w) + val links = for ((p,w) <- otherPapers.zip(weights) if w >= 1) yield Link(p.index,w) + + // Add links to paper, and set it as linked + val result = p.setLinks(links).setMeta("linked", "yes") + + // Save result + Cache.save(result, Cache.linked) + + result + } + else p + }) + println("END OF PAPERS COMPARISION") + 
finalPapers + } + + def getWeight(p : Paper, o : Paper) : Int = { + // Get names + val pNames = p.getDistinctNames + val oNames = o.getDistinctNames + + // For each auther in p, check if he/she exists in other + var matches = (for (name <- pNames if oNames.contains(name)) yield 1).sum + + // return result + return (100 * matches.toDouble / pNames.length.toDouble).toInt + + } + +} diff --git a/app/paper/ExtendPaper.scala b/app/paper/ExtendPaper.scala new file mode 100644 index 0000000..989592f --- /dev/null +++ b/app/paper/ExtendPaper.scala @@ -0,0 +1,105 @@ +package paper + +abstract class PaperSource { + def getInfo(p : Paper) : String + def getLabel : String +} + +object TalkDates extends PaperSource { + + import scala.util.Random + import java.util.Date + import java.sql.Timestamp + import java.util.Calendar + + // TODO: This is just a temporary implementation + def getInfo(p : Paper) : String = { + + // Get Calendar and Random + var c = Calendar.getInstance + var r = new Random + + // Set starting point as tomorrow at 8 + c.add(Calendar.DAY_OF_MONTH, 1) + c.set(Calendar.HOUR_OF_DAY,8) + c.set(Calendar.MINUTE,0) + c.set(Calendar.SECOND,0) + c.set(Calendar.MILLISECOND,0) + + // Now add between zero and 6 hours + c.add(Calendar.HOUR, (r.nextDouble * 7).toInt) + // Add between 0 and 5 days + c.add(Calendar.DAY_OF_MONTH, (r.nextDouble * 6).toInt) + + // Get a timeStamp + var t = new Timestamp(c.getTime.getTime).getTime.toString + + //var n = new Timestamp(new Date().getTime).getTime + //var r = (n + (new Random().nextDouble * (60*60*24*4*1000)).toLong).toString + + return t + } + + def getLabel : String = "date" +} + + +// Adds the link of the pdf to the paper +object PdfLink extends PaperSource { + import scala.util.Random + + def getInfo(p : Paper) : String = { + + var f : String = p.meta("file") + var pdf : String = f.takeWhile(_!='.').concat(".pdf") + return pdf; + } + + def getLabel : String = "pdf" +} + + +object TalkRooms extends PaperSource { + import 
scala.util.Random + + // TODO: This is also just a temporary thing + def getInfo(p : Paper) : String = { + + // Return a random room between 1 and 10 + return (new Random().nextDouble * 10).toInt.toString + } + + def getLabel : String = "room" +} + + +/** Extend paper loops through a list of sources. Each source implements the + * interface paperSource and provides two methods: getLabel and getInfo. + * Get label returns the map label, while getInfo returns the particular information + */ +trait ExtendPaper { + + def extend(paperPos: String, papers : Option[List[Paper]], sources : List[PaperSource]) : List[Paper] = { + println("BEGIN OF PAPERS EXTENSION") + val loadedPapers = if(papers == None) CacheLoader.load(paperPos, Cache.scheduled) else papers.get + val finalPapers = loadedPapers.map(p => { + + var result : Paper = p + + // For each source, check if it's already added, and if not, add it + for (s <- sources if !p.hasMeta(s.getLabel)) { + result = result.setMeta(s.getLabel, s.getInfo(p)) + } + + // Save result + Cache.save(result, Cache.extended) + + // return result + result + }) + + println("END OF PAPERS EXTENSION") + finalPapers + } + +} diff --git a/app/paper/FileFormat.scala b/app/paper/FileFormat.scala new file mode 100644 index 0000000..b8e9604 --- /dev/null +++ b/app/paper/FileFormat.scala @@ -0,0 +1,72 @@ +package paper +import java.io.File + +object Paths { + private val toolsDirStr = "tools" + private val linuxDirStr = "linux" + private val windowsDirStr = "windows" + private val windowsSepStr = "\\" + private val linuxSepStr = "/" + private val windowsExtStr = ".exe" + private val linuxExtStr = "" + + + private def getOSDir: String = { if(SystemHelper.isWindows) windowsSepStr + windowsDirStr + windowsSepStr + else if(SystemHelper.isLinux) linuxSepStr + linuxDirStr + linuxSepStr + else "" } + + // Returns the tools directory according to the current operating system + def toolsDir = toolsDirStr + getOSDir + + // Returns the file extension according 
to the current operating system + def ext = if(SystemHelper.isWindows) windowsExtStr else if(SystemHelper.isLinux) linuxExtStr else "" + + // Returns the path separator according to the current operating system + def sep = if(SystemHelper.isWindows) windowsSepStr else if(SystemHelper.isLinux) linuxSepStr else "" +} + +// This class transforms a File object into another File object, according to the extension of the file +abstract class FileFormat { + def convertTo(format: String, params: List[String]): File = this match{ + case TXTFormat(file) => file + case PDFFormat(file) => { + val command = CommandDetector.detect(Paths.toolsDir + "pdfTo" + format + "Converter" + Paths.ext, format) + + val process: Process = sys.runtime.exec((List(command) ::: params ::: List(file.getAbsolutePath())).toArray[String]) + + // Waiting until the end of the command execution + if(process.waitFor() != 0) { println("Can't convert pdf file. Program will exit"); exit } + + new File(file.getParent() + Paths.sep + SystemHelper.name(file.getName) + "." 
+ format) + } + } + + // Performs final procedures before the end of the file processing (mostly deleting intermediate files) + def releaseFile(newFile: File) = this match { + case TXTFormat(file) => + case PDFFormat(file) => newFile.delete() + } +} + +case class TXTFormat (file: File) extends FileFormat +case class PDFFormat (file: File) extends FileFormat + + +object FileFormatDispatcher { + // Determines which FileFormat should be used according to the extension of the file + def getFileFormat(file: File): FileFormat = { + SystemHelper.ext(file.getName()) match { + case "txt" => new TXTFormat(file) + case "pdf" => new PDFFormat(file) + } + } +} + +// This object handles special tool cases +object CommandDetector { + def detect(toolPath: String, format: String): String = { + if(format.equals("xml") && SystemHelper.isLinux) return "pdftohtml" + + toolPath + } +} \ No newline at end of file diff --git a/app/paper/FileLoader.scala b/app/paper/FileLoader.scala new file mode 100644 index 0000000..adaf8e3 --- /dev/null +++ b/app/paper/FileLoader.scala @@ -0,0 +1,112 @@ +package paper +import java.io.File +import scala.io.Source +import scala.io.BufferedSource + +// This object provides helpful methods for file and system managing +object SystemHelper { + private val supportedFormats = """pdf""" + private val os = sys.props.get("os.name") + + def ext(file: String) = """^.+?\.""".r.findFirstIn(file.reverse).get.dropRight(1).reverse + def name(file: String) = """^.+?\.""".r.replaceAllIn(file.reverse, "").reverse + def isSupported(format: String): Boolean = ("^" + supportedFormats + "$").r.findFirstIn(format).isDefined + + // Returns the files from a directory + def getFilesFromDirectory(orig: File): List[File] = { + if(orig.isDirectory()) orig.listFiles.filter(f => (""".+\.(""" + supportedFormats + """)$""").r.findFirstIn(f.getName).isDefined).toList + else Nil + } + + // The apps is on Windows + def isWindows: Boolean = os match { + case Some(s) => 
""".*Windows.*""".r.findFirstIn(s.toString()).isDefined + case None => false + } + + // The apps is on Linux + def isLinux: Boolean = os match { + case Some(s) => """.*Linux.*""".r.findFirstIn(s.toString()).isDefined + case None => false + } +} + +trait FileLoader { + def loadFromFile(file : File, p : Parsers) : Option[Paper] + + // This function sets the id and metadata before final returning + def setLastModifications(file: File, paper: Option[Paper]): Option[Paper] = { + if (paper != None) { + // Get index + try { + var id = (SystemHelper.name(file.getName())).toInt + + // Set filename and id + val finalPaper : Paper = paper.get.setMeta("file" -> file.getPath).setId(id) + + return Some(finalPaper) + }catch { + case _ => { println("The paper's name must contain only numerical values. Not parsed"); return None } + } + } + return None + } +} + +// This loader doesn't care about the format, it just passes the file to the parser +object SimpleLoader extends FileLoader { + def loadFromFile(file : File, p : Parsers) : Option[Paper] = { + println("parsing " + file.getPath + " using simple loader") + + val text = Source.fromFile(file) + + // actual parsing of the file content + val maybePaper : Option[Paper] = p.parse(text) + + // If paper exists and parsed, save it in cache + setLastModifications(file, maybePaper) + } +} + +// This class uses external tools in order to convert a particular format file into another before it is passed to the parser +abstract class ExternalLoader extends FileLoader { + // Converts a particular format file into another using an external tool + def loadFromFile(file : File, p : Parsers, format : String, params : List[String]) : Option[Paper] = { + // getFileFormat looks for other file formats (like pdf), then using external tools it extracts the content of the file and + // converts it to another format file + val fileFormat = FileFormatDispatcher.getFileFormat(file) + val newFile = fileFormat.convertTo(format, params) + val text = 
Source.fromFile(newFile) + + // actual parsing of the file content + val maybePaper : Option[Paper] = p.parse(text) + + // this method deletes temporary files that could have been previously created + fileFormat.releaseFile(newFile) + + // If paper exists and parsed, save it in cache + setLastModifications(file, maybePaper) + } +} + +// This object tries to convert the input file into txt before parsing +object TXTConverterLoader extends ExternalLoader { + override def loadFromFile(file : File, p : Parsers) : Option[Paper] = { + println("parsing " + file.getPath + " using txt loader") + // looking for the format and setting the correct parameters + loadFromFile(file, p, "txt", List("-enc", "UTF-8")) + } +} + +// This object converts a file into an xml one before parsing. Must use a parser that parses XML +object XMLConverterLoader extends ExternalLoader { + def loadFromFile(file : File, p : Parsers) : Option[Paper] = { + println("parsing " + file.getPath + " using xml loader") + + // looking for the format and setting the correct parameters + SystemHelper.ext(file.getName()) match { + case "txt" => loadFromFile(file, p, "txt", List("-enc", "UTF-8")) + case "pdf" => loadFromFile(file, p, "xml", List("-xml", "-q", "-enc", "UTF-8")) + } + } +} \ No newline at end of file diff --git a/app/paper/Graph.scala b/app/paper/Graph.scala new file mode 100644 index 0000000..0ce495e --- /dev/null +++ b/app/paper/Graph.scala @@ -0,0 +1,100 @@ +package paper + +trait Graphs { + + def getGraph(paperPos:String, papers : Option[List[Paper]]) : Graph = { + println("BEGIN OF GRAPH CREATION") + val loadedPapers = if(papers == None) CacheLoader.load(paperPos, Cache.linked) else papers.get + // Add all papers as nodes + val nodes : List[Node] = for (p <- loadedPapers) yield makeNode(p) + + // Then create all edges + val edges : List[Edge] = for (p <- loadedPapers; e <- makeEdges(p, nodes)) yield e + + println("END OF GRAPH CREATION") + return new Graph(nodes, edges) + } + + // 
MODIFICATION + def makeNode(paper : Paper) : Node = { + println("Making node for " + paper.id) + Node(paper.id, paper.meta("xmlpapertitle"), paper.meta("xmlauthors"), paper.meta("pdf"), paper.meta("xmldate"), paper.meta("xmlroom")) + } + + def makeEdges(paper : Paper, nodes : List[Node]) : List[Edge] = { + // make edge + val edges = for (link <- paper.links) yield (makeEdge(paper.index, link)) + // Sort edges by weight and pick the n biggest + val l = math.min(4, edges.length) + return edges.sortWith(_.weight > _.weight).take(l) + } + + def makeEdge(index : Int, link : Link) : Edge = Edge(index, link.index, link.weight) + +} + +class Graph(nodes : List[Node], edges : List[Edge]) { + + def save : Unit = { + val f = new java.io.File("data.json") + val p = new java.io.PrintWriter(f) + p.println(toString) + p.close + } + + override def toString : String = { + var ret : String = "{\n" + + // add nodes + ret += "\"nodes\":" + nodes.mkString("[\n ",",\n ","\n],") + "\n" + + // add edges + ret += "\"links\":" + edges.mkString("[\n ",",\n ","\n]") + "\n\n" + + // End + ret += "}" + + return ret + } +} + +case class Node(id : Int, title : String, authors : String, pdf : String, date : String, room : String) { + override def toString : String = { + var ret : String = "{" + ret += "\"id\":" + id + ",\n " + ret += "\"title\":\"" + Escape(title) + "\",\n " + ret += "\"authors\":\"" + Escape(authors) + "\",\n " + ret += "\"pdf\":\"" + Escape(pdf) + "\",\n " + ret += "\"date\":\"" + Escape(date) + "\",\n " + ret += "\"room\":\"" + Escape(room) + "\"" + ret += "}" + return ret + } +} + +case class Edge(from : Int, to : Int, weight : Int) { + override def toString : String = "{\"source\":" + from + ",\"target\":" + to + ",\"value\":" + weight + "}" +} + +/** Escapes a raw string for use in HTML.*/ +object Escape +{ + def apply(s: String) = + { + val out = new StringBuilder + for(i <- 0 until s.length) + { + s.charAt(i) match + { + case '>' => out.append(">") + case '&' => 
out.append("&") + case '<' => out.append("<") + case '"' => out.append(""") + case '\n' => out.append(" ") + case '\\' => out.append("\\\\") + case c => out.append(c) + } + } + out.toString + } +} diff --git a/app/paper/InformationExtractors.scala b/app/paper/InformationExtractors.scala new file mode 100644 index 0000000..03c356e --- /dev/null +++ b/app/paper/InformationExtractors.scala @@ -0,0 +1,180 @@ +package paper + +trait InformationExtractor { + // This method finds the first element in a list matching a particular condition and returns it with the rest of the list + protected def findFirstOf(list: List[XMLParagraph], condition: (XMLParagraph) => Boolean): List[XMLParagraph] = list match{ + case List() => Nil + case x::xs => if(condition(x)) x::xs else findFirstOf(xs, condition) + } + + // This method find the first element matching a condition and returns the previous ones (not matching) + protected def untilFirstOf(list: List[XMLParagraph], condition: (XMLParagraph) => Boolean): List[XMLParagraph] = { + def untilFirstOf0(l: List[XMLParagraph], accu: List[XMLParagraph]): List[XMLParagraph] = l match{ + case List() => accu.reverse + case x::xs => if(condition(x)) accu.reverse else untilFirstOf0(xs, x::accu) + } + + untilFirstOf0(list, List()) + } + + // This method filters the first elements matching a condition which are adjacent + protected def filterAdjacents(list: List[XMLParagraph], condition: (XMLParagraph) => Boolean): List[XMLParagraph] = { + def filterAdjacents0(l: List[XMLParagraph], accu: List[XMLParagraph]): List[XMLParagraph] = l match { + case List() => accu + case x::xs => if(condition(x)) filterAdjacents0(xs, x::accu) + else if(accu.length == 0) filterAdjacents0(xs, accu) else accu + } + + filterAdjacents0(list, List()).reverse + } + + // This method finds the first elements matching a condition which are adjacent and returns the list of the remaining ones (after these) + protected def discardAdjacents(list: List[XMLParagraph], condition: 
(XMLParagraph) => Boolean): List[XMLParagraph] = { + def discardAdjacents0(l: List[XMLParagraph], accu: Int): List[XMLParagraph] = l match { + case List() => Nil + case x::xs => if(condition(x)) discardAdjacents0(xs, accu + 1) + else if(accu == 0) discardAdjacents0(xs, accu) else x::xs + } + + discardAdjacents0(list, 0) + } + + // This method takes a list of strings and returns a string containing the concatenation of all the other strings + protected def concatText(list: List[String]): String = { + def concatText0(l: List[String], accu: String): String = l match { + case List() => if(accu.length() > 0) accu.dropRight(1) else accu // Last \n replacement + case x::xs => concatText0(xs, accu + x + "\n") + } + + concatText0(list, "") + } +} + + + + +trait AuthorsExtractor1 extends InformationExtractor{ + // This method extracts the authors of the article. + // It uses the following rules: + // - They are located in the first page between title and abstract + // - Every author paragraph contains the name at the top of it, i.e. 
the names are located at the first line of these paragraphs + def extractAuthors(paper: Paper, xml: XMLDocument, paragraphs: List[XMLParagraph]): (List[XMLParagraph], Paper) = { + val list = untilFirstOf(paragraphs, (p: XMLParagraph) => p.hasOption(XMLParagraphOptions.JUSTIFY) && p.getPosition.getWidth >= xml.getPage(1).getPosition.getWidth / 3) + val remainingList = paragraphs.drop(list.length) + + // Finding of the most at the top paragraphs + if(list.length == 0) return (paragraphs, paper) + val minTop = list.map((p:XMLParagraph) => p.getPosition.getY).min + val unprocessedAuthorsList = list.filter((p:XMLParagraph) => p.getPosition.getY == minTop) + + // Applying the extraction processing + val authors = unprocessedAuthorsList.flatMap((p:XMLParagraph) => ("""(.+?""" + ExtractionRegexes.authorsSeparator + """)|(.+?$)""").r.findAllIn(""" [0-9]+""".r.replaceAllIn(p.getLines.head.getText, ""))) + val authorsList = authors.map((s: String) => new Author(ExtractionRegexes.authorsSeparator.r.replaceAllIn(s, ""))) + + + if(authorsList.length != 0) (remainingList, paper.setAuthors(authorsList)) + else (paragraphs, paper) + + } +} + +trait AbstractExtractor1 extends InformationExtractor{ + // This method extracts the abstract of the article. 
+ // It uses the following rules: + // - It is the first justified paragraph + // - It is entirely contained in the first page + def extractAbstract(paper: Paper, xml: XMLDocument, paragraphs: List[XMLParagraph]): (List[XMLParagraph], Paper) = { + val list = findFirstOf(paragraphs, (p: XMLParagraph) => p.hasOption(XMLParagraphOptions.JUSTIFY) && p.getPosition.getWidth >= xml.getPage(1).getPosition.getWidth / 3) + + if(list.length == 0) return (paragraphs, paper) + val xmlFont = xml.getFontsContainer.getXMLFont(list.head.getFontID) + val abstractList = filterAdjacents(list, (p: XMLParagraph) => xmlFont.get.checkID(p.getFontID) && p.hasOption(XMLParagraphOptions.JUSTIFY)).map((p: XMLParagraph) => p.getText.replaceAll("\n", " ")) + val remainingList = discardAdjacents(list, (p: XMLParagraph) => xmlFont.get.checkID(p.getFontID) && p.hasOption(XMLParagraphOptions.JUSTIFY)) + + if(abstractList.length != 0) (remainingList, paper.setAbstract(new Abstract(concatText(abstractList)))) + else (paragraphs, paper) + } +} + +trait TitleExtractor1 extends InformationExtractor { + // This method extracts the title of the article. 
+ // It uses the following rules: + // - The title is contained in the first page + // - It has the biggest font size + def extractTitle(paper: Paper, xml: XMLDocument, paragraphs: List[XMLParagraph]): (List[XMLParagraph], Paper) = { + // This method finds the font id having the maximum size + def findMaxSizeID(paragraphs: List[XMLParagraph]): String = { + def findMaxSizeID0(paragraphs: List[XMLParagraph], max: Int, id: String): String = paragraphs match{ + case List() => id + case x::xs => + val newSize = xml.getFontsContainer.getXMLFont(x.getFontID).get.getSize.toInt + + if(newSize > max) findMaxSizeID0(xs, newSize, xml.getFontsContainer.getXMLFont(x.getFontID).get.getID) + else findMaxSizeID0(xs, max, id) + } + + findMaxSizeID0(paragraphs, 0, "") + } + + val maxSizeIDFont = xml.getFontsContainer.getXMLFont(findMaxSizeID(paragraphs.take(10))) + val title = findFirstOf(paragraphs, p => maxSizeIDFont.get.checkID(p.getFontID) || p.hasOption(XMLParagraphOptions.PAGE_CENTERED)) + + if(title.length != 0) (title.tail, paper.setTitle(new Title(title.head.getText.replace("\n", " ")))) + else (paragraphs, paper) + } +} + + +trait BodyExtractor1 extends InformationExtractor { + // This method extracts the body of the article. 
+ // It uses the following rules: + // - It begins with the first justified paragraph after the abstract + // - It is justified and belongs to one column + def extractBody(paper: Paper, xml: XMLDocument, paragraphs: List[XMLParagraph]): (List[XMLParagraph], Paper) = { + val firstBodyList = findFirstOf(paragraphs, (p: XMLParagraph) => p.hasOption(XMLParagraphOptions.JUSTIFY) && !p.hasOption(XMLParagraphOptions.NO_COLUMN)) + if(firstBodyList.length == 0) return (paragraphs, paper) + val bodyXmlFont = xml.getFontsContainer.getXMLFont(firstBodyList.head.getFontID) + + // Filter (applying the rules) + val bodyListWithReturn = firstBodyList.filter((p: XMLParagraph) => bodyXmlFont.get.checkID(p.getFontID) && p.hasOption(XMLParagraphOptions.JUSTIFY) && !p.hasOption(XMLParagraphOptions.NO_COLUMN)) + + // Calculating the remaining list + val lastBodyParagraph = bodyListWithReturn.last + val bodyList = bodyListWithReturn.map((p: XMLParagraph) => p.getText.replaceAll("\n", " ")) + val remainingList = findFirstOf(paragraphs, (p:XMLParagraph) => p.getText.equals(lastBodyParagraph.getText)).tail + + if(bodyList.length != 0) (remainingList, paper.setBody(new Body(concatText(bodyList)))) + else (paragraphs, paper) + } +} + +trait ReferencesExtractor1 extends InformationExtractor { + // This method extracts the references of the article. + // It uses the following rules: + // - They begin with a paragraph containing a reference title + // - Each following paragraph is a reference. 
+ // - The method recognizes the reference format and applies a suitable extraction strategy + def extractReferences(paper: Paper, xml: XMLDocument, paragraphs: List[XMLParagraph]): (List[XMLParagraph], Paper) = { + val refParagraphs = findFirstOf(paragraphs, (p: XMLParagraph) => ("""^""" + ExtractionRegexes.referencesName + """$""").r.findFirstIn(p.getText).isDefined) + val referencesStringList = if(!refParagraphs.isEmpty) refParagraphs.tail else List() + + // This method takes a list of reference paragraphs and process them + def makeReferences(refs: List[XMLParagraph], accu: List[Reference]): List[Reference] = refs match { + case List() => accu + case x::xs => { + // Using of the good reference processor after a structure recognition + val refExtr = ReferenceProcessorExecuter.extract(x.getText.replaceAll("\n", " ")) + val title = refExtr._1 + val authors = refExtr._2 + + if(title == None || authors == None) makeReferences(xs, accu) + else makeReferences(xs, (new Reference(authors.get, title.get)::accu)) + } + } + + val refList = makeReferences(referencesStringList, List()).reverse + + if(refList.length != 0) (List(), paper.setReferences(refList)) + else (paragraphs, paper) + } +} \ No newline at end of file diff --git a/app/paper/LoadPaper.scala b/app/paper/LoadPaper.scala new file mode 100644 index 0000000..b6828ab --- /dev/null +++ b/app/paper/LoadPaper.scala @@ -0,0 +1,232 @@ +package paper + +import scala.util.parsing.input._ +import scala.collection.immutable.Stream +import scala.io.Source +import java.io._ + + +object Cache { + + // Constants + val dir = "cache" + Paths.sep + val parsed = "parsed" + val extended = "extended" + val linked = "linked" + val scheduled = "scheduled" + val bad = "bad" + + import scala.io.Source + + def bad(file : File) : Unit = { + val f = new File(dir + file.getName + "." 
+ bad) + // Make sure file exists + if(!f.exists) f.createNewFile + } + + def save(p : Paper, postfix : String) : Unit = { + val orig = new File(p.meta("file")) + val f = new File(dir + orig.getName + "." + postfix) + // Make sure file exists + if(!f.exists) f.createNewFile + val w = new PrintWriter(f) + + // Print paper + w.println("[[[ ID ]]]" + "\n" + p.id) + w.println("[[[ INDEX ]]]" + "\n" + p.index) + w.println("[[[ TITLE ]]]" + "\n" + p.title) + w.println("[[[ AUTHORS ]]]" + "\n" + p.authors.mkString("\n")) + w.println("[[[ ABSTR ]]]" + "\n" + p.abstr.text) + w.println("[[[ BODY ]]]" + "\n" + p.body.text) + w.println("[[[ REFS ]]]" + "\n" + p.refs.mkString("\n----\n")) + w.println("[[[ META ]]]" + "\n" + p.meta.mkString("\n")) + w.println("[[[ LINKS ]]]" + "\n" + p.links.mkString("\n----\n")) + w.close + } + + def load(file : File) : Paper = { + + // Printout + println("Loading file " + file.getName + " from cache") + + // Get file and read in lines + val lines : Iterator[String] = Source.fromFile(file).getLines + + // Variables + var vars : Map[String, List[String]] = Map.empty.withDefaultValue(Nil) + var current = "unknown"; + + // Order the information + for (l <- lines) l match { + case "[[[ ID ]]]" => current = "id" + case "[[[ INDEX ]]]" => current = "index" + case "[[[ TITLE ]]]" => current = "title" + case "[[[ AUTHORS ]]]" => current = "authors" + case "[[[ ABSTR ]]]" => current = "abstr" + case "[[[ BODY ]]]" => current = "body" + case "[[[ REFS ]]]" => current = "refs" + case "[[[ META ]]]" => current = "meta" + case "[[[ LINKS ]]]" => current = "links" + case line => vars = vars + (current -> (vars(current) ::: List(line))) + } + + return Paper(vars("id").head.toInt, + vars("index").head.toInt, + Title(vars("title").head), + stringToAuthors(vars("authors")), + Abstract(vars("abstr").mkString("\n")), + Body(vars("body").mkString("\n")), + stringToRefs(vars("refs")), + setMeta(vars("meta")), + stringToLinks(vars("links"))) + + } + + def setMeta(s : 
List[String]) : Map[String, String] = { + var m : Map[String, String] = Map.empty + // Looping through all the maps + for (e <- s) { + m = m + Pair((e.split(" -> "))(0), (e.split(" -> "))(1)) + } + return m + } + + def stringToLinks(s : List[String]) : List[Link] = { + return s.filter(l => l.split(" ").length == 2).map(l => Link(l.split(" ").head.toInt, l.split(" ").last.toInt)) + } + + def stringToAuthors(s : List[String]) : List[Author] = { + return s.map(l => Author(l)) + } + + def stringToRefs(s : List[String]) : List[Reference] = { + // Variables + var map : Map[String, List[String]] = Map.empty.withDefaultValue(Nil) + var index : Int = 0 + var current = "authors" + index + + // Make sure we have any references + if (s.head == "") return Nil + + // Order the information + for (l <- s) l match { + case "--" => current = "title" + index + case "----" => index += 1; current = "authors" + index + case line => map = map + (current -> (line :: map(current))) + } + + // Now gather it + val refs = for (i <- 0 to index) yield Reference(stringToAuthors(map("authors" + i)), Title(map("title" + i).head)) + return refs.toList + } +} + +trait LoadPaper { + + def loadAndParse(name : String, postfix : List[String], parser : Parsers, loader : FileLoader) : List[Paper] = { + println("BEGIN OF PARSING") + + // Get file handle of original file or directory + val orig = new File(name) + + // Check that directory or file exists + if (!orig.exists) println("Something is wrong with the file or directory in the argument : " + name) + + // If exists, set name and file + // In case it's a directory, let the file array contain all the files of the directory (regex utilization) + val files : List[File] = if(orig.isDirectory) SystemHelper.getFilesFromDirectory(orig) else List(orig) + val fnames : List[String] = if(orig.isDirectory) files.map(f => name ++ f.getName) else List(name) + + + // If postfix exists, try loading from cache + val somePapers : List[Option[Paper]] = if (postfix != 
Nil) files.map(f => loadFromCache(f, postfix)) else Nil + + // All papers that weren't loaded by cache are loaded by file + val finalPapers = somePapers.zip(files).map(p => if (p._1 == None) loadFromFile(p._2, parser, loader) else p._1) + + // Filter papers for None's and set index + val papers : List[Paper] = finalPapers.filter(p => p != None).zipWithIndex.map({case Pair(p,i) => p.get.setIndex(i) }).toList + + println("END OF PARSING") + + return papers + } + + + // Loads a paper from a text file and parses it. It has been modified in order to make loading and parsing flexible + def loadFromFile(file : File, p : Parsers, loader: FileLoader) : Option[Paper] = { + + // Check if file is bad or contains non numerical values (in the name) + if (checkIfBad(file) || """[^0-9]+""".r.findFirstIn(SystemHelper.name(file.getName())).isDefined) return None + + val result = loader.loadFromFile(file, p) + + // If paper doesn't exist and didn't parse, let's not parse it again + if(result == None) return isBadFile(file) + else Cache.save(result.get.clean, Cache.parsed) // Save and return + + return result + } + + + def isBadFile(file: File): Option[Paper] = { + println("Couldn't parse " + file.getName()) + Cache.bad(file) + return None + } + + def checkIfBad(file : File) : Boolean = { + + // Get file + val cached = new File(Cache.dir + file.getName + "." + Cache.bad) + + // Check that file exists + return (cached.exists) + } + + // Loads a paper from cache + def loadFromCache(file : File, postfix : List[String]) : Option[Paper] = { + // Helper function to load file + def getCache(postfix : String) : Option[Paper] = { + // Get file + val cached = new File(Cache.dir + file.getName + "." + postfix) + + // Check that file exists + if (!cached.exists) return None + + // Else, load away + return Some(Cache.load(cached)) + } + + postfix match { + // No postfix? 
- Return None + case Nil => None + // a postfix, well, let's try it + case p::ps => { + val ret = getCache(p) + // If we didn't find anything in cache, recurse + if (ret == None) loadFromCache(file, ps) + // Else return result + else return ret + } + } + } +} + + +object CacheLoader extends LoadPaper{ + def load(paperPos:String, postfix : String): List[Paper] = { + // Get file handle of original file or directory + val orig = new File(paperPos) + + // Check that directory or file exists + if (!orig.exists) sys.error("Problem with file path") + + val files : List[File] = if(orig.isDirectory()) orig.listFiles.toList else List(orig) + + // If postfix exists, try loading from cache + val papers : List[Option[Paper]] = files.map(f => loadFromCache(f, List(postfix))) + + return papers.filter((p:Option[Paper]) => p != None).map((p:Option[Paper]) => p.get) + } + } \ No newline at end of file diff --git a/app/paper/ParsePaper.scala b/app/paper/ParsePaper.scala new file mode 100644 index 0000000..536f361 --- /dev/null +++ b/app/paper/ParsePaper.scala @@ -0,0 +1,273 @@ +package paper + +import scala.collection.immutable.Stream +import scala.collection.immutable.StringOps +import scala.util.parsing.input._ +import scala.io.Source +import java.io.File + +abstract class Parsers { + def parse(file : Source) : Option[Paper] +} + +trait ParsePaper { + + def getText(in: Source): Stream[Char] = in.hasNext match { + case false => in.close(); Stream.Empty + case true => in.next #:: getText(in) + } + + + object Isit extends Parsers { + + def paper : Parser[Paper] = ( + //title ~ authors ~ dropLinesUntil("R EFERENCES") ~ references + dropLinesUntil("R EFERENCES") ~ references + ^^ { case b~r => Paper(0,0,Title(""),Nil,Abstract("Not saved"),Body("Not saved"),r,Map.empty,List()) } + | dropLinesUntil("References") ~ references + ^^ { case b~r => Paper(0,0,Title(""),Nil,Abstract("Not saved"),Body("Not saved"),r,Map.empty,List()) }) + + + def title : Parser[Title] = ( + line && ("Title:" ~> 
rest) + ^^ trim ^^ (s => Title(s.init.mkString))) + + def authors : Parser[List[Author]] = ( + line && "Author: " ~> split(", ") + ^^ (as => { + if (as.length == 0) Nil + else { + var names = as.init.map(a => a.mkString) ::: List(as.last.mkString.init) + names.map(a => Author(formatAuthor(a.mkString))) + } + })) + + def abstr : Parser[Abstract] = ( + dropLinesUntil("Abstract") ~> takeLinesUntil("I. ") + ^^ (t => Abstract(t.mkString))) + + def body : Parser[Body] = ( + takeLinesUntil("R EFERENCES") + ^^ (t => Body(t.mkString))) + + def refBracket : Parser[Input] = ( + "[" ~ rep(number) ~ "] " ^^^ Stream.Empty) + + def refLine : Parser[Input] = ( + takeLinesUntil("\n" | refBracket)) + + def refAuthors : Parser[List[Author]] = ( + until(", “") && split(", and " | " and " | ", ") + ^^ { x => x.init.map { a => Author(a.mkString) } } ) + + def refTitle : Parser[Title] = ( + until(",\"" | "\"," | "\"." | ".") ^^ (s => Title(s.mkString)) // Character modification. Possible errors in the future !!! 
+ | success(Title(""))) + + def reference : Parser[Reference] = ( + refLine && (refAuthors ~ refTitle) + ^^ { case a~t => Reference(a, t) }) + + def references : Parser[List[Reference]] = ( + dropLinesUntil(refBracket) ~> rep(reference) ^^ cleanRefs) + + + // The function for actually parsing a paper + def parse(file : Source) : Option[Paper] = { + paper(getText(file)) match { + case Failure(msg, rest) => println("Failure: " + msg); None + case Success(result, rest) => Some(result.setMeta("parsed" -> "yes")) + } + } + } + + + def cleanRefs(refs : List[Reference]) : List[Reference] = { + val ret = for (r <- refs if r.title.t.stripMargin != "" && r.authors.length > 0) yield { + // Clean authors + var authors = for (a <- r.authors if a.name.stripMargin.length > 2) yield Author(a.name.stripMargin) + Reference(authors, r.title) + } + return ret + } + + def formatAuthor(name : String) : String = { + var result = "" + var names = name.split(" ").filter(n => n.length > 0) + if (names.length > 0) { + result = names.init.filter(n => n.length > 0).map(n => n.head).mkString("",". ",". 
") + names.last + } + return result + } + + + + type Input = Stream[Char] + sealed abstract class Result[+T] + case class ~ [+T, +U](r1 : T, r2 : U) + case class Success[T](result: T, in : Input) extends Result[T] + case class Failure(msg : String, in : Input) extends Result[Nothing] + + val lineSep : List[Char] = List('\n','\r') + val tokenSep : List[Char] = List(' ',',','.',':','[',']','-') ::: lineSep + + abstract class Parser[+T] extends (Input => Result[T]) { + p => + + def ~ [U](q: => Parser[U]) = new Parser[T~U] { + def apply(in: Input) = p(in) match { + case Success(x, in1) => q(in1) match { + case Success(y, in2) => Success(new ~(x, y), in2) + case Failure(msg, in) => Failure(msg, in) + } + case Failure(msg, in) => Failure(msg, in) + } + } + + def | [U >: T](q: => Parser[U]) = new Parser[U] { + def apply(in: Input) = p(in) match { + case s @ Success(x, rest) => s + case Failure(_,_) => q(in) + } + } + + def ^ [U](f: (T, Input) => U, g: (T, Input) => Input) : Parser[U] = new Parser[U] { + def apply(in: Input) = p(in) match { + case Success(x, rest) => Success(f(x, rest), g(x, rest)) + case Failure(msg, in) => Failure(msg, in) + } + } + + def ^^ [U](f: T => U) : Parser[U] = p ^ ({ case (x, _) => f(x) }, { case (_,r) => r }) + def ^^^ [U](v: U) = p ^^ { case _ => v } + def ~> [U](q: => Parser[U]) = (p ~ q) ^^ { case a~b => b } + def <~ [U](q: => Parser[U]) = (p ~ q) ^^ { case a~b => a } + + def &&[U](q: => Parser[U]) = new Parser[U] { + def apply(in: Input) = p(in) match { + case f @ Failure(msg, in) => f + case Success(x1, rest) => { + // Because I don't know the result of parser q, I'm backtracing to figure out what was matched + var matched = in.zipWithIndex.takeWhile{case (x,i) => !in.drop(i).equals(rest)}.unzip._1 + q(matched) match { + case Success(x2, rest2) => Success(x2, rest) + case Failure(msg, rest2) => Failure(msg, rest) + } + } + } + } + } + + //def inLine[U](q : => Parser[U]) : Parser[U] = q <~ "\n" + + def line : Parser[Input] = next(lineSep) 
+ def token : Parser[Input] = next(tokenSep) + + def number : Parser[Input] = "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "0" + + def trim(s : Input) : Input = s.dropWhile(c => c==' ' || c=='\t') + def line(p : Parser[Input]) : Parser[Input] = line && p && rest + + + // Splits input by letting the parser p be the delimiter + def split(p : Parser[Input]) : Parser[List[Input]] = rep(until(p)) + + def until[U](p : => Parser[U]) : Parser[Input] = new Parser[Input] { + override def apply(in: Input) : Result[Input] = { + + def gather(s : Input, soFar : Input) : Result[Input] = p(s) match { + case Success(result, rest) => Success(soFar, rest) + case Failure(msg, Stream.Empty) => Success(soFar, Stream.Empty) + case Failure(_, _) => gather(s.drop(1), soFar ++ s.take(1)) + } + + if (in == Stream.Empty) return Failure("[Until] Reached end of input", in) + else return gather(in, Stream.Empty) + } + } + + // Much faster than the naive version, and also avoids a stackoverflow + def takeLinesUntil[U](p : => Parser[U]) : Parser[Input] = new Parser[Input] { + override def apply(in: Input) : Result[Input] = { + + def gather(s : Input, soFar : Input) : Result[Input] = { + p(s) match { + case Success(result, rest) => Success(soFar, rest) + case Failure(msg, Stream.Empty) => Success(soFar, Stream.Empty) + case Failure(_, _) => gather(s.dropWhile(c => c != '\n').drop(1), soFar ++ s.takeWhile(c => c != '\n').drop(1)) + } + } + + if (in == Stream.Empty) return Failure("[takeLinesUntil] Reached end of input", in) + else return gather(in, Stream.Empty) + } + } + + //def takeLinesUntil(p : Parser[Input]) = rep(line(p)) + def dropLinesUntil[U](p : Parser[U]) = takeLinesUntil(p) ~> success(Stream.Empty) + + + def not(s: String) : Parser[Input] = new Parser[Input] { + override def apply(in: Input) : Result[Input] = in.take(s.length).mkString == s match { + case true => Failure("Failed: " + s + " was matched", in) + case false => Success(Stream.Empty, in) + } + } + + def opt [T](q : => 
Parser[T]) : Parser[Option[T]] = ( + q ^^ { case a => Some(a) } + | success(None)) + + def rep [T](p : => Parser[T]) : Parser[List[T]] = ( + p ~ rep(p) ^^ { case a~b => a::b } + | success(List())) + + def repsep [T, U](p : => Parser[T], sep : => Parser[U]) : Parser[List[T]] = ( + p ~ rep(sep ~> p) ^^ { case a~b => a::b } + | success(List())) + + def success[T](v: T) : Parser[T] = new Parser[T] { + override def apply(in: Input) : Result[T] = Success(v,in) + } + + def rest : Parser[Input] = new Parser[Input] { + override def apply(in: Input) : Result[Input] = { + in.length match { + case 0 => Failure("[Rest] Reached end of input",in) + case n => Success(in, Stream.Empty) + } + } + } + + def next(sep : List[Char]) : Parser[Input] = new Parser[Input] { + def apply(in: Input) = in.span(c => !sep.contains(c)) match { + case (Stream.Empty, Stream.Empty) => Failure("[Next] Reached end of input", in) + case (head, rest) => Success(head.init, rest.drop(1)) + } + } + + implicit def predicate(p: String => Boolean) : Parser[Input] = new Parser[Input] { + def apply(in: Input) = { + p(in.mkString) match { + case true => Success(Stream.Empty, in) + case false => Failure("Couldn't match predicate on:\n" + in.mkString, Stream.Empty) + } + } + } + + implicit def str(t: String) : Parser[Input] = new Parser[Stream[Char]] { + def apply(in: Input) = in.take(t.length).mkString == t match { + case true => Success(in.take(t.length), in.drop(t.length)) + case _ => Failure("Couldn't match expected: '" + t + "' given " + in.mkString, in) + } + } + + implicit def resultToStream(r: Result[Input]) : Input = r match { + case Success(x, rest) => x + case Failure(_,_) => Stream.Empty + } + + // //implicit def Stream2Str(s : Stream[Char]) : String = s.mkString + +} + diff --git a/app/paper/RecognitionClasses.scala b/app/paper/RecognitionClasses.scala new file mode 100644 index 0000000..0a5f49b --- /dev/null +++ b/app/paper/RecognitionClasses.scala @@ -0,0 +1,124 @@ +package paper + +object 
ExtractionRegexes {
+ // Shared regex fragments used by the reference extractors below.
+ // quoteB/quoteE accept both typographic and plain ASCII quotes.
+ val and = """(and|&)"""
+ val quoteB = """(“|\")"""
+ val quoteE = """(”|\")"""
+ val tabulation = """ \t """
+ val authorsSeparator = """( ?, and | ?, | and | ?, \t | \t | & )"""
+ val referencesName = """(REFERENCES|R EFERENCES|References|R eferences)"""
+ val titleTermination = """[\.\?]"""
+}
+
+// This class is at the top of the hierarchy: every concrete reference-format
+// processor implements extract, returning a (title, authors) pair where each
+// component is optional (None when the format was not recognized).
+abstract class ReferenceProcessor {
+ def extract(ref: String): (Option[Title], Option[List[Author]])
+}
+
+// This class tries to extract information using a list of existent defined formats.
+// Processors are effectively tried in declaration order: extract0 recurses over
+// the reversed list, so the first-declared processor is the recursion's base case,
+// and the first processor whose result has a defined title or authors wins.
+object ReferenceProcessorExecuter {
+ val recognitionClasses: List[ReferenceProcessor] = List[ReferenceProcessor](NumberedReferenceProcessor1, NumberedReferenceProcessor2, TextualReferenceProcessor1)
+
+ def extract(ref: String): (Option[Title], Option[List[Author]]) = {
+ // Walks the (reversed) processor list and keeps the innermost non-empty result.
+ def extract0(refProcessors: List[ReferenceProcessor]): (Option[Title], Option[List[Author]]) = refProcessors match{
+ case List() => (None, None)
+ case x::List() => x.extract(ref)
+ case x::xs => {
+ val input = extract0(xs)
+ val title = input._1
+ val authors = input._2
+
+ // Only fall through to this processor when the earlier ones found nothing at all.
+ if(title == None && authors == None) x.extract(ref)
+ else (title, authors)
+ }
+ }
+
+ extract0(recognitionClasses.reverse)
+ }
+}
+
+
+
+// This class can extract information out of a reference having the following format: [digit] authors "title"
+object NumberedReferenceProcessor1 extends ReferenceProcessor {
+ def extract(ref: String): (Option[Title], Option[List[Author]]) = {
+ // Title = the text between the opening and closing quote; a trailing ',' is stripped.
+ def extractTitle: Option[Title] = {
+ val t = (ExtractionRegexes.quoteB + """.+""" + ExtractionRegexes.quoteE).r.findFirstIn(ref)
+ if(!t.isDefined) return None
+
+ // Dropping the " characters
+ val title = t.get.drop(1).dropRight(1)
+ val finalTitle = if(title.last.equals(',')) title.dropRight(1) else title
+ Some(new Title(finalTitle))
+ }
+
+ // Authors = everything between the leading "[n]" marker and the opening quote.
+ def extractAuthors: Option[List[Author]] = {
+ val t = ("""^\[\d+\].+""" + ExtractionRegexes.quoteB).r.findFirstIn(ref)
+ if(!t.isDefined) return None
+
+ val totAuths = t.get
+ val auths = ("""^\[\d+\]""").r.replaceAllIn(totAuths.dropRight(1), "")
+
+ // Each author chunk starts with initials ("A. " / "A.-") and runs to the next separator.
+ val authorsStringList = ("""([A-Z]\.[ -])+.+?( """ + ExtractionRegexes.and + """|,)""").r.findAllIn(auths).toList
+
+ // Strip the " and "/" & " separator and any commas from each matched chunk.
+ Some(authorsStringList.map((s:String) => new Author(("""( """ + ExtractionRegexes.and + """)""").r.replaceAllIn(s, "").replace(",", ""))))
+ }
+
+ // Format guard: "[n] ... “title” ..." — quotes may be ASCII or typographic.
+ if(("""^\[\d+\].+""" + ExtractionRegexes.quoteB + """.+""" + ExtractionRegexes.quoteE + """(.+)?$""").r.findFirstIn(ref).isDefined){
+ (extractTitle, extractAuthors)
+ } else (None, None)
+ }
+}
+
+// This class can extract information out of a reference having the following format: [digit] authors. title
+object NumberedReferenceProcessor2 extends ReferenceProcessor {
+ def extract(ref: String): (Option[Title], Option[List[Author]]) = {
+ def extract0(ref: String): (Option[Title], Option[List[Author]]) = {
+ // Author chunk: leading initials, then text up to the next separator (and / , / .).
+ val authRegex = ("""([A-Z]\.[ -])+.+?( """ + ExtractionRegexes.and + """|,|, """ + ExtractionRegexes.and + """|\.)""").r
+
+ // Drop the leading "[n] " marker before matching authors.
+ val ref2 = ("""^\[\d+\]( )""").r.replaceAllIn(ref, "")
+ val authorsStringList = authRegex.findAllIn(ref2).toList
+
+ // dropRight(1) removes the trailing separator character the regex keeps in the match.
+ val auths = Some(authorsStringList.map((s:String) => new Author(("""( """ + ExtractionRegexes.and + """)""").r.replaceAllIn(s, "").replace(",", "").dropRight(1))))
+
+ // Title = the remainder (authors removed) up to the first '.' or '?'.
+ val t = ("""^.+?""" + ExtractionRegexes.titleTermination + """""").r.findFirstIn(authRegex.replaceAllIn(ref2, ""))
+ if(!t.isDefined) return (None, None)
+
+ // Dropping characters until the first capital letter is found
+ val title = Some(new Title(t.get.replace(".", "").dropWhile((c:Char) => """[^A-Z]""".r.findFirstIn(""+c).isDefined)))
+
+ (title, auths)
+ }
+
+ // Format guard: the reference must start with a "[n]" marker.
+ if(("""^\[\d+\].+$""").r.findFirstIn(ref).isDefined) extract0(ref)
+ else (None, None)
+ }
+}
+
+// This class can extract information out of a reference having the following format: authors (digits) title
+object TextualReferenceProcessor1 extends ReferenceProcessor {
+ def extract(ref: String): (Option[Title], Option[List[Author]]) = {
+ def
extractTitle: Option[Title] = { + val t = ("""\)\..+?""" + ExtractionRegexes.titleTermination + """""").r.findFirstIn(ref) + if(!t.isDefined) return None + + Some(new Title(t.get.drop(3))) + } + + def extractAuthors: Option[List[Author]] = { + val t = """^.+?\(""".r.findFirstIn(ref) + if(!t.isDefined) return None + + // Dropping the " (" string + val auths = t.get.dropRight(2) + + val authorsStringList = (""".+?,( [A-Z]\.)+(, (""" + ExtractionRegexes.and + """ )?)?""").r.findAllIn(auths).toList + + Some(authorsStringList.map((s:String) => new Author(("""( """ + ExtractionRegexes.and + """)""").r.replaceAllIn(s, "").replace(",", "")))) + } + + if(("""^.+?\(\d+\)\..+?\..+?\.$""").r.findFirstIn(ref).isDefined) (extractTitle, extractAuthors) + else (None, None) + } +} diff --git a/app/paper/Terms.scala b/app/paper/Terms.scala new file mode 100644 index 0000000..7329835 --- /dev/null +++ b/app/paper/Terms.scala @@ -0,0 +1,106 @@ +package paper + +/** Abstract Syntax Trees for terms. */ +sealed abstract class Term + + +// Try to keep this immutable +case class Paper(val id : Int, + val index : Int, + val title: Title, + val authors: List[Author], + val abstr: Abstract, + val body: Body, + val refs: List[Reference], + val meta: Map[String, String], + val links : List[Link]) extends Term { + + // Add a field that contains options, such as parsed and linked + + val parsed : Boolean = false + val linked : Boolean = false + + override def toString: String = title + "\n" + authors.mkString(", ") + "\n" + abstr + "\n" + body + "\n" + refs.mkString("\n") + + def getDistinctNames : List[String] = { + val as = authors ::: refs.flatMap(r => r.authors) + val names = as.map(a => a.toString) + return names.distinct + } + + def getTitle : Title = title + + def getAuthors : List[Author] = authors + def getAbstract : Abstract = abstr + def getBody : Body = body + def getReferences : List[Reference] = refs + + def clean : Paper = + return Paper(id, index, title, authors.filter(a => 
a.name.length > 4), abstr, body, refs.map(r => r.clean), meta, links) + + def setMeta(p : (String, String)) : Paper = + return Paper(id, index, title, authors, abstr, body, refs, meta + p, links) + + def setTitle(t : Title) : Paper = + return Paper(id, index, t, authors, abstr, body, refs, meta, links) + + def setAuthors(as : List[Author]) : Paper = + return Paper(id, index, title, as, abstr, body, refs, meta, links) + + def hasMeta(l : String) : Boolean = (meta.get(l) != None) + + def setId(newId : Int) : Paper = + return Paper(newId, index, title, authors, abstr, body, refs, meta, links) + + def setIndex(newIndex : Int) : Paper = + return Paper(id, newIndex, title, authors, abstr, body, refs, meta, links) + + def setLinks(newLinks : List[Link]) : Paper = + return Paper(id, index, title, authors, abstr, body, refs, meta, newLinks) + + def setAbstract(newAbstract : Abstract) : Paper = + return Paper(id, index, title, authors, newAbstract, body, refs, meta, links) + + def setBody(newBody : Body) : Paper = + return Paper(id, index, title, authors, abstr, newBody, refs, meta, links) + + def setReferences(newRefs : List[Reference]) : Paper = + return Paper(id, index, title, authors, abstr, body, newRefs, meta, links) +} + +case class Title(t: String) extends Term { + override def toString: String = t + + def getText: String = t +} + +case class Author(name: String) extends Term { + override def toString: String = name + + def getName: String = name +} + +case class Abstract(text: String) extends Term { + override def toString : String = "Abstract:\t" + text.take(40) + " ... " + + def getText: String = text +} + +case class Body(text: String) extends Term { + override def toString: String = "Body:\t\t" + text.take(100) ++ " ... 
\n" + + def getText: String = text +} + +case class Reference(authors: List[Author], title: Title) extends Term { + def clean : Reference = return Reference(authors.filter(a => a.name.stripMargin.length > 0), title) + override def toString : String = authors.mkString("\n") + "\n--\n" + title + + def getAuthors: List[Author] = authors + def getTitle: Title = title +} + +case class Link(index : Int, weight : Int) extends Term { + override def toString : String = index + " " + weight +} + diff --git a/app/paper/XMLObjects.scala b/app/paper/XMLObjects.scala new file mode 100644 index 0000000..712b412 --- /dev/null +++ b/app/paper/XMLObjects.scala @@ -0,0 +1,108 @@ +package paper + +// This class represents a position (can be a paragraph or even a page) +class XMLPosition(x: Int, y: Int, width: Int, height: Int) { + def getX: Int = x + def getY: Int = y + def getWidth: Int = width + def getHeight: Int = height + + override def toString(): String = "[x= " + x + ", y= " + y + ", width= " + width + ", height= " + height + "]" +} + +// This class defines a particular font (with the totality of xml ids defining it) +class XMLFont(IDs:String, size: String, family: String, color: String) { + def getID: String = IDs + def getSize: String = size + def getFamily: String = family + def getColor: String = color + + // This method adds a new xml id to the definition of the font + def addID(id: String): XMLFont = new XMLFont(IDs + "-" + id, size, family, color) + + // This method checks if a particular xml id is defining that font. 
Use "xmlFont.checkID(myid)" instead of "xmlFont.getID == myid"
+ // IDs holds the equivalent xml ids joined with "-"; the regex tests whether `id`
+ // appears as one whole element of that "-"-separated set (start, middle, or end).
+ def checkID(id: String): Boolean = ("(^(" + IDs.replace("-", "|") + ")$)|(^(" + IDs.replace("-", "|") + ")-)|(-(" + IDs.replace("-", "|") + ")$)|(-(" + IDs.replace("-", "|") + ")-)").r.findFirstIn(id).isDefined
+
+ // This method checks if two XML fonts are basically the same
+ def compareXMLFont(font: XMLFont): Boolean = (size == font.getSize && family == font.getFamily && color == font.getColor)
+}
+
+// This class contains the fonts of the document
+class XMLFontsContainer(fonts: List[XMLFont]){
+ // Returns the first font whose id-set contains the given xml id.
+ def getXMLFont(id: String): Option[XMLFont] = fonts.find(f => f.checkID(id))
+ def filter (f:XMLFont => Boolean) = fonts.filter(f)
+}
+
+
+// Three-letter codes describing a paragraph's layout.
+object XMLParagraphOptions {
+ val CENTERED = "CTR"
+ val PAGE_CENTERED = "PCT"
+ val JUSTIFY = "JFY"
+ val COLUMN_LEFT = "CLL"
+ val COLUMN_RIGHT = "CLR"
+ val NO_COLUMN = "NCL"
+ val ENUMERATION = "ENM"
+ val NONE = "NON"
+}
+
+// Immutable holder for a paragraph's option codes, stored as a "|"-joined string.
+class XMLParagraphOptionsContainer(value: String) {
+ def getValue: String = value
+ def addOption(option: String): XMLParagraphOptionsContainer = if(!hasOption(option)) new XMLParagraphOptionsContainer(value + "|" + option) else this
+ // Treats the stored "|"-separated codes as a regex alternation and tests whether
+ // the queried option is exactly one of them.
+ def hasOption(option: String): Boolean = ("^(" + value + ")$").r.findFirstIn(option).isDefined
+ // NOTE(review): removes the "|option" substring only, so the very first code in
+ // `value` (which has no leading "|") can never be removed — confirm this is intended
+ // (containers appear to start from the NONE placeholder).
+ def removeOption(option: String): XMLParagraphOptionsContainer = new XMLParagraphOptionsContainer(value.replace("|" + option, ""))
+}
+
+// One physical text line: its font id, bounding box, and text content.
+class XMLLine(fontID: String, position: XMLPosition, text: String) {
+ def getFontID: String = fontID
+ def getPosition: XMLPosition = position
+ def getText: String = text
+
+ def setFontID(newFont: String) = new XMLLine(newFont, position, text)
+ // Appends newLine's text; the width is extended to newLine's right edge while
+ // x, y and height keep this line's values.
+ def addText(newLine: XMLLine): XMLLine = new XMLLine(fontID, new XMLPosition(position.getX, position.getY, (newLine.getPosition.getX + newLine.getPosition.getWidth) - position.getX, position.getHeight), text + newLine.getText)
+ def setText(newText: String): XMLLine = new XMLLine(fontID, position, newText)
+}
+
+// This class represents a
paragraph contained in a page
class XMLParagraph(fontID: String, position: XMLPosition, options: XMLParagraphOptionsContainer, lines: List[XMLLine], linesSeparator: String, text: String, enumFormat: String) {
  def getFontID: String = fontID
  def getPosition: XMLPosition = position
  def getText: String = text
  def getLines: List[XMLLine] = lines
  def getEnumerationFormat: String = enumFormat

  def addOption(option: String): XMLParagraph = new XMLParagraph(fontID, position, options.addOption(option), lines, linesSeparator, text, enumFormat)
  def hasOption(option: String): Boolean = options.hasOption(option)
  def removeOption(option: String): XMLParagraph = new XMLParagraph(fontID, position, options.removeOption(option), lines, linesSeparator, text, enumFormat)
  def getOptionsValue: String = options.getValue

  // Bounding box that spans this paragraph and `other`: left edge is the
  // leftmost x, right edge the rightmost, top stays at this paragraph's y
  // and the bottom extends down to `other`'s bottom edge.
  private def mergedPosition(other: XMLPosition): XMLPosition = {
    val left  = Math.min(position.getX, other.getX)
    val right = Math.max(position.getX + position.getWidth, other.getX + other.getWidth)
    new XMLPosition(left, position.getY, right - left, (other.getY + other.getHeight) - position.getY)
  }

  // This method adds a new XMLLine to the paragraph. However, the added line will be the first one on the list, so pay attention!
  // The text and position arguments will be correctly updated
  def addLine(line: XMLLine): XMLParagraph =
    new XMLParagraph(fontID, mergedPosition(line.getPosition), options, line :: lines, linesSeparator, text + linesSeparator + line.getText, enumFormat)

  // Same as addLine but merges a whole paragraph; the merged paragraph's
  // lines are prepended (list is in reverse order until reverseLines is called).
  def addParagraph(newParagraph: XMLParagraph): XMLParagraph =
    new XMLParagraph(fontID, mergedPosition(newParagraph.getPosition), options, newParagraph.getLines ::: lines, linesSeparator, text + linesSeparator + newParagraph.getText, enumFormat)

  def reverseLines: XMLParagraph = new XMLParagraph(fontID, position, options, lines.reverse, linesSeparator, text, enumFormat)
}

// This class is the representation of a page
class XMLPage(number: Int, position: XMLPosition, paragraphs: List[XMLParagraph]) {
  def getNumber: Int = number
  def getPosition: XMLPosition = position
  def getParagraphs: List[XMLParagraph] = paragraphs
}

// This class defines the entire xml file structure
class XMLDocument(fontsContainer: XMLFontsContainer, pages: List[XMLPage]) {
  // All paragraphs of the document, in page order.
  private val paragraphs: List[XMLParagraph] = pages.flatMap(_.getParagraphs)

  def getFontsContainer: XMLFontsContainer = fontsContainer
  // Throws NoSuchElementException when no page carries `pageNumber`.
  def getPage(pageNumber: Int): XMLPage = pages.find(_.getNumber == pageNumber).get
  def getParagraphs: List[XMLParagraph] = paragraphs
}


diff --git a/app/paper/XMLObjectsManager.scala b/app/paper/XMLObjectsManager.scala new file mode 100644 index 0000000..3e7a51d --- /dev/null +++ b/app/paper/XMLObjectsManager.scala @@ -0,0 +1,115 @@
package paper
import scala.xml.XML
import scala.xml.Elem
import scala.xml.NodeSeq
import scala.util.control.NonFatal


object XMLObjectsManager {

  // Empty paragraph used as the seed when accumulating lines into paragraphs.
  def getCleanXMLParagraph(lineSeparator: String): XMLParagraph = new XMLParagraph("", new XMLPosition(0, 0, 0, 0), new XMLParagraphOptionsContainer(XMLParagraphOptions.NONE), List(), lineSeparator, "", "")

  // Since the xml file sometimes creates different fonts with the same features (size, family, etc), it
  // is important to create a structure that takes into account this detail and hides it.
  // The XMLFont class performs this job. XMLFontsContainer just contains the list of XMLFont objects.
  // Returns None when no font at all could be collected.
  private def getFontsContainer(xml: Elem): Option[XMLFontsContainer] = {
    val pages = (xml \\ "page")

    // Updates the previous XMLFont list with the fonts declared on one page.
    def getXMLFontListFromPage(previousList: List[XMLFont], pageNumber: String): List[XMLFont] = {
      // Runs through the page's fontspec nodes and folds them into the list.
      def constructFontLists(fonts: NodeSeq, accu: List[XMLFont]): List[XMLFont] = {
        // If an equivalent font is already known, merge the new ID into it;
        // otherwise append the new font at the end of the list.
        def checkIfExist(xmlFont: XMLFont, list: List[XMLFont], accu: List[XMLFont]): List[XMLFont] = list match {
          case List() => (xmlFont :: accu).reverse
          case x :: xs =>
            if (x.compareXMLFont(xmlFont)) (x.addID(xmlFont.getID) :: accu).reverse ::: xs
            else checkIfExist(xmlFont, xs, x :: accu)
        }

        if (fonts.isEmpty) accu
        else {
          val xmlFont = new XMLFont((fonts.head \ "@id").text, (fonts.head \ "@size").text, (fonts.head \ "@family").text, (fonts.head \ "@color").text)
          constructFontLists(fonts.tail, checkIfExist(xmlFont, accu, List()))
        }
      }

      // A page number must match exactly one page node.
      val page = pages filter ((n) => (n \ "@number").text.equals(pageNumber))
      if (page.length != 1) previousList
      else constructFontLists(page.head \\ "fontspec", previousList)
    }

    // Walks every page number from 1 to pagesNumber, accumulating fonts.
    def constructUntilPage(pagesNumber: Int, accu: List[XMLFont], currentPage: Int): List[XMLFont] =
      if (currentPage > pagesNumber) accu
      else constructUntilPage(pagesNumber, getXMLFontListFromPage(accu, currentPage.toString()), currentPage + 1)

    val xmlFonts = constructUntilPage(pages.length, List(), 1)
    if (xmlFonts.nonEmpty) Some(new XMLFontsContainer(xmlFonts)) else None
  }

  // Creates the pages of the document. The resulting list is in reverse order.
  // Returns None as soon as a single page fails to build.
  private def constructXMLPages(pages: NodeSeq, fontsContainer: Option[XMLFontsContainer], lineSeparator: String): Option[List[XMLPage]] = {
    def constructXMLPage(page: xml.Node): Option[XMLPage] = {
      try {
        val number = (page \ "@number").text.toInt
        // NOTE(review): x is read from @top and y from @left, which is the
        // opposite of XMLLine construction elsewhere — looks swapped; kept
        // as-is because downstream layout code may depend on it. Verify.
        val x = (page \ "@top").text.toInt
        val y = (page \ "@left").text.toInt
        val width = (page \ "@width").text.toInt
        val height = (page \ "@height").text.toInt
        val position = new XMLPosition(x, y, width, height)
        val paragraphs = XMLParagraphsConstructor.constructXMLParagraphs(page, position, fontsContainer.get, lineSeparator)

        if (paragraphs.isDefined) Some(new XMLPage(number, position, paragraphs.get))
        else None
      } catch {
        // Only recoverable failures (e.g. NumberFormatException on malformed
        // attributes) are swallowed; fatal errors still propagate.
        case NonFatal(_) => None
      }
    }

    def constructXMLPages0(pages: NodeSeq, accu: Option[List[XMLPage]]): Option[List[XMLPage]] =
      if (pages.isEmpty) accu
      else {
        val page = constructXMLPage(pages.head)
        if (page.isEmpty) None
        else constructXMLPages0(pages.tail, Some(page.get :: accu.get))
      }

    if (fontsContainer.isDefined) constructXMLPages0(pages, Some(List()))
    else None
  }

  // Builds the xml document of the file; None when fonts or pages fail.
  def constructXMLDocument(xml: Elem, lineSeparator: String): Option[XMLDocument] = {
    val fontsContainer = getFontsContainer(xml)
    val xmlPages = constructXMLPages(xml \\ "page", fontsContainer, lineSeparator)

    // constructXMLPages returns pages reversed, so restore document order.
    if (xmlPages.isDefined) Some(new XMLDocument(fontsContainer.get, xmlPages.get.reverse))
    else None
  }
}
\ No
newline at end of file
diff --git a/app/paper/XMLParagraphsConstructor.scala b/app/paper/XMLParagraphsConstructor.scala new file mode 100644 index 0000000..5b11604 --- /dev/null +++ b/app/paper/XMLParagraphsConstructor.scala @@ -0,0 +1,180 @@
package paper
import scala.xml.NodeSeq

object XMLParagraphsConstructor {
  // Fuzzy integer coordinate: comparisons hold within +/- tolerance pixels.
  // NOTE: these operators deliberately overload <, <=, >, >= and != with
  // tolerance-aware semantics; === is the fuzzy equality test.
  private class ParagraphDelimiter(value: Int, tolerance: Int) {
    def >= (cValue: Int): Boolean = (cValue <= value + tolerance)
    def <= (cValue: Int): Boolean = (cValue >= value - tolerance)
    def > (cValue: Int): Boolean = (cValue < value + tolerance)
    def < (cValue: Int): Boolean = (cValue > value - tolerance)

    // Fuzzy equality / inequality within the tolerance band.
    def === (cValue: Int): Boolean = this >= cValue && this <= cValue
    def != (cValue: Int): Boolean = !(this === cValue)
  }


  // Groups the <text> nodes of one page into paragraphs, applying the layout
  // rules documented inline. Returns the paragraphs in page order.
  def constructXMLParagraphs(page: xml.Node, pagePosition: XMLPosition, fontsContainer: XMLFontsContainer, lineSeparator: String): Option[List[XMLParagraph]] = {
    val lines = page \\ "text"
    val pageCenter = new ParagraphDelimiter(pagePosition.getWidth / 2, 3)

    def createXMLLine(line: xml.Node) = new XMLLine((line \ "@font").text, new XMLPosition((line \ "@left").text.toInt, (line \ "@top").text.toInt, (line \ "@width").text.toInt, (line \ "@height").text.toInt), line.text)

    // Font that covers the most characters in the accumulated line; ties are
    // broken by map iteration order, exactly as the original filter/head did.
    def dominantFontID(fontLengthMap: Map[String, Int]): String = {
      val maxFontLength = fontLengthMap.toList.map((f: (String, Int)) => f._2).max
      fontLengthMap.filter((p: (String, Int)) => p._2 == maxFontLength).head._1
    }

    // Merges horizontally-adjacent <text> fragments into one XMLLine and
    // determines the line's dominant font (see documentation).
    def incorporate(nextLines: NodeSeq, currentLine: XMLLine, fontLengthMap: Map[String, Int]): (NodeSeq, XMLLine) = nextLines.isEmpty match {
      case true =>
        (nextLines, currentLine.setFontID(dominantFontID(fontLengthMap)))
      case false =>
        val nextLine = createXMLLine(nextLines.head)
        val currentLineYRange = new ParagraphDelimiter(currentLine.getPosition.getY, currentLine.getPosition.getHeight - 3)
        // A large horizontal gap between fragments is rendered as a tab.
        val tabulation = if(nextLine.getPosition.getX - (currentLine.getPosition.getX + currentLine.getPosition.getWidth) >= 2*currentLine.getPosition.getHeight) " \t " else " "

        // If the top of the next fragment falls in the current line's y-range,
        // it belongs to the same visual line and is merged in.
        if(currentLineYRange === nextLine.getPosition.getY) {
          // Count characters per font so the dominant font can be chosen later.
          val nextFontLength = fontLengthMap.get(nextLine.getFontID)
          val newMap = if(nextFontLength == None) fontLengthMap + ((nextLine.getFontID, (tabulation + nextLine.getText).length)) else fontLengthMap - (nextLine.getFontID) + ((nextLine.getFontID, (tabulation + nextLine.getText).length + nextFontLength.get))

          incorporate(nextLines.tail, currentLine.addText(nextLine.setText(tabulation + nextLine.getText)), newMap)
        }
        else {
          (nextLines, currentLine.setFontID(dominantFontID(fontLengthMap)))
        }
    }

    // Main linking step (see documentation).
    // Output: (updated paragraph, line included?, paragraph will continue?)
    def linkLine(line: XMLLine, paragraph: XMLParagraph): (XMLParagraph, Boolean, Boolean) = {
      // Extracts an enumeration prefix pattern like "1." as "{d}." from the
      // first characters of the line, or "" when there is none.
      def constructEnumFormat(line: String): String = {
        val enumRegex = """^([^0-9]*)?[0-9]+([^0-9]+?) """

        if(line.length < 5) return ""
        else {
          val result = enumRegex.r.findFirstIn(line.take(5))
          if(result.isDefined) return """[0-9]""".r.replaceAllIn(result.get, "{d}").dropRight(1)
          else return ""
        }
      }

      val lineCenter = ((2 * line.getPosition.getX) + line.getPosition.getWidth) / 2
      val enumFormat = constructEnumFormat(line.getText)

      // First paragraph's line (rule 1)
      if(paragraph.getLines.length == 0) {
        val optionContainer = if(pageCenter === lineCenter) (new XMLParagraphOptionsContainer(XMLParagraphOptions.NONE)).addOption(XMLParagraphOptions.PAGE_CENTERED) else new XMLParagraphOptionsContainer(XMLParagraphOptions.NONE)
        val enumeratedOptionContainer = if(enumFormat != "") optionContainer.addOption(XMLParagraphOptions.ENUMERATION) else optionContainer

        return (new XMLParagraph(line.getFontID, line.getPosition, enumeratedOptionContainer, List(line), lineSeparator, line.getText, enumFormat), true, true)
      }

      // Geometry of the previously linked line (head of the reversed list).
      val previousLineCenter = new ParagraphDelimiter(((2 * paragraph.getLines.head.getPosition.getX) + paragraph.getLines.head.getPosition.getWidth) / 2, 3)
      val previousLineText = paragraph.getLines.head.getText
      val previousLineTop = paragraph.getLines.head.getPosition.getY
      val previousLineBegin = new ParagraphDelimiter(paragraph.getLines.head.getPosition.getX, 3)
      val previousLineEnd = new ParagraphDelimiter(paragraph.getLines.head.getPosition.getX + paragraph.getLines.head.getPosition.getWidth, 3)
      val lineBegin = line.getPosition.getX
      val lineEnd = line.getPosition.getX + line.getPosition.getWidth
      val lineTop = line.getPosition.getY
      val lineHeight = line.getPosition.getHeight
      // Positive when a line is mostly upper-case (title heuristic).
      val previousLineCapitalVersurNotDifference = """[A-Z]""".r.findAllIn(previousLineText).length - """[a-z]""".r.findAllIn(previousLineText).length
      val lineCapitalVersurNotDifference = """[A-Z]""".r.findAllIn(line.getText).length - """[a-z]""".r.findAllIn(line.getText).length

      // Rule 2: big vertical gap or a different font ends the paragraph.
      if(lineTop - previousLineTop > 2*lineHeight || !fontsContainer.getXMLFont(paragraph.getFontID).get.checkID(line.getFontID)) return (paragraph, false, false)
      // Rule 3: The current line is not a title but previous is (contains only capital letters), even if they have the same font
      if(lineCapitalVersurNotDifference <= 0 && previousLineCapitalVersurNotDifference > 0) return (paragraph, false, false)
      // If both lines have the same enumeration format (which must be defined), then the current line is a new enumeration, hence it belongs to a new paragraph
      if(paragraph.hasOption(XMLParagraphOptions.ENUMERATION) && enumFormat == paragraph.getEnumerationFormat) return (paragraph, false, false)


      // This is the second paragraph's line
      if(paragraph.getLines.length == 1) {
        // Rule 4
        if(previousLineBegin === lineBegin && previousLineEnd === lineEnd && previousLineCenter === lineCenter) return (paragraph.addLine(line).addOption(XMLParagraphOptions.CENTERED).addOption(XMLParagraphOptions.JUSTIFY), true, true)
        // Rule 5
        else if(previousLineBegin != lineBegin && previousLineEnd != lineEnd && previousLineCenter === lineCenter) return (paragraph.addLine(line).addOption(XMLParagraphOptions.CENTERED), true, true)
        // Rule 6
        else if(previousLineEnd >= lineEnd) return (paragraph.addLine(line).addOption(XMLParagraphOptions.JUSTIFY), true, true)
      }

      // Normal line (more than the second)
      if(paragraph.getLines.length > 1) {
        if(paragraph.hasOption(XMLParagraphOptions.CENTERED) && !paragraph.hasOption(XMLParagraphOptions.JUSTIFY)) {
          // Rule 7.a
          if(previousLineCenter === lineCenter) return (paragraph.addLine(line), true, true)
        }
        else if(!paragraph.hasOption(XMLParagraphOptions.CENTERED) && paragraph.hasOption(XMLParagraphOptions.JUSTIFY)) {
          // Rule 8.a
          if(previousLineBegin === lineBegin && previousLineEnd === lineEnd) return (paragraph.addLine(line), true, true)
          // Rule 8.b: a short last line closes the paragraph.
          else if(previousLineBegin === lineBegin && previousLineEnd > lineEnd) return (paragraph.addLine(line), true, false)
        }
        else if(paragraph.hasOption(XMLParagraphOptions.CENTERED) && paragraph.hasOption(XMLParagraphOptions.JUSTIFY)) {
          // Rule 9.a
          if(previousLineBegin === lineBegin && previousLineEnd === lineEnd) return (paragraph.addLine(line), true, true)
          // Rule 9.b
          else if(previousLineBegin != lineBegin && previousLineEnd != lineEnd && previousLineCenter === lineCenter) return (paragraph.addLine(line).removeOption(XMLParagraphOptions.JUSTIFY), true, true)
          // Rule 9.c
          else if(previousLineBegin === lineBegin && previousLineEnd > lineEnd) return (paragraph.addLine(line).removeOption(XMLParagraphOptions.CENTERED), true, false)
        }
      }

      // For all other cases
      (paragraph, false, false)
    }


    // Tags the paragraph with its column placement relative to page center.
    def setLayout(paragraph: XMLParagraph): XMLParagraph = {
      if(pageCenter > paragraph.getPosition.getX + paragraph.getPosition.getWidth) paragraph.addOption(XMLParagraphOptions.COLUMN_LEFT)
      else if(pageCenter < paragraph.getPosition.getX) paragraph.addOption(XMLParagraphOptions.COLUMN_RIGHT)
      else paragraph.addOption(XMLParagraphOptions.NO_COLUMN)
    }

    // Consumes lines until the current paragraph is complete.
    def constructNewParagraph(lines: NodeSeq, currentParagraph: XMLParagraph): (NodeSeq, XMLParagraph) = lines.isEmpty match {
      case true => (lines, currentParagraph)
      case false =>
        val tempLine = createXMLLine(lines.head)
        // Incorporation
        val incorporation = incorporate(lines.tail, tempLine, Map((tempLine.getFontID, tempLine.getText.length)))

        val remainingLines = incorporation._1
        // Linkage
        val linkage = linkLine(incorporation._2, currentParagraph)

        // If the line was not added it must be reconsidered for the next paragraph.
        val finalRemainingLines = if(linkage._2) remainingLines else lines

        // If the paragraph continues
        if(linkage._3) constructNewParagraph(finalRemainingLines, linkage._1)
        else (finalRemainingLines, linkage._1)
    }

    // This method constructs the paragraphs. Be careful because the final list is reversed
    def constructParagraphs(lines: NodeSeq, accu: List[XMLParagraph]): List[XMLParagraph] = lines.isEmpty match {
      case true => accu
      case false =>
        val construction = constructNewParagraph(lines, XMLObjectsManager.getCleanXMLParagraph(lineSeparator))
        val layoutParagraph = setLayout(construction._2.reverseLines)
        constructParagraphs(construction._1, layoutParagraph :: accu)
    }

    // Global page processing: drop tiny fragments and anything outside the
    // page bounds. (filterNot replaces the long-removed List.remove — same
    // semantics: keep the paragraphs for which the predicate is false.)
    def processGlobalPage(paragraphs: List[XMLParagraph]): List[XMLParagraph] = paragraphs.filterNot(p => (p.getLines.length == 1 && p.getText.length() <= 3) || p.getPosition.getX < 0 || (p.getPosition.getX + p.getPosition.getWidth) > (pagePosition.getX + pagePosition.getWidth) || p.getPosition.getY < 0 || (p.getPosition.getY + p.getPosition.getHeight) > (pagePosition.getY + pagePosition.getHeight))

    val finalParagraphs = processGlobalPage(constructParagraphs(lines, List()).reverse)

    Some(finalParagraphs)
  }
}
\ No newline at end of file
diff --git a/app/paper/XMLParser.scala b/app/paper/XMLParser.scala new file mode 100644 index 0000000..f4157fd --- /dev/null +++ b/app/paper/XMLParser.scala @@ -0,0 +1,79 @@
package paper
import scala.io.Source
import scala.xml.XML
import scala.xml.Elem
import scala.xml.NodeSeq
import scala.xml.TypeSymbol
import scala.util.matching.Regex.MatchIterator
import sun.nio.cs.Unicode


object XMLParser extends Parsers with TitleExtractor1
                                 with AuthorsExtractor1
                                 with AbstractExtractor1
                                 with BodyExtractor1
                                 with ReferencesExtractor1{

  // Extractors are applied in this order by extract().
  val extractionOrder: List[(Paper, XMLDocument, List[XMLParagraph]) => (List[XMLParagraph], Paper)] = List(extractTitle, extractAuthors, extractAbstract, extractBody, extractReferences)


  // This method returns the xml representation of the text contained in the Source object
  def getXMLObject(in: Source): Option[Elem] = {
    // String generation and illegal xml characters removing
    val text = in.mkString.replace("" + '\uffff', 
"").replace("" + "\u001f", "").replace("\u0010", "").replace("\u000b", "").replace("\u000c", "")
    // This instruction is important, otherwise the xml file can't be deleted
    in.close

    try {
      // The replacement of the and tags is important because loadString sometimes generate an exception about these tags
      // Of course, some information is lost, but not really an important one
      // NOTE(review): the regex below is empty — the tag pattern it once
      // contained appears to have been stripped from the source; as written
      // the call returns `text` unchanged. Verify against history.
      Some(XML.loadString("""""".r.replaceAllIn(text, "")))
    } catch {
      // Only recoverable parse failures are swallowed; fatal errors propagate.
      case scala.util.control.NonFatal(_) => println("Couldn't load the XML file."); None
    }
  }


  // Runs the extractors over the paper in extractionOrder; threads the
  // (document, paper, remaining paragraphs) triple through each step.
  def extract(extractors: List[(Paper, XMLDocument, List[XMLParagraph]) => (List[XMLParagraph], Paper)], t : (XMLDocument, Option[Paper], List[XMLParagraph])): (XMLDocument, Option[Paper], List[XMLParagraph]) = {
    def extract0(extractors: List[(Paper, XMLDocument, List[XMLParagraph]) => (List[XMLParagraph], Paper)], t : (XMLDocument, Option[Paper], List[XMLParagraph])): (XMLDocument, Option[Paper], List[XMLParagraph]) = {
      if(extractors.length == 0) return t

      // The list was reversed by the caller, so recursing on the tail first
      // applies the extractors in their original order.
      val input = if(extractors.length == 1) t else extract0(extractors.tail, t)
      val xml = input._1
      val paper = input._2
      val paragraphs = input._3

      if(paper != None) {
        // Calling the extraction method of the extractor
        val extraction = extractors.head(paper.get, xml, paragraphs)
        return (xml, Some(extraction._2), extraction._1)
      }

      (xml, None, paragraphs)
    }

    extract0(extractors.reverse, t)
  }

  // The function for actually parsing a paper
  def parse(in: Source) : Option[Paper] = {
    val xml = getXMLObject(in)

    if(xml == None) None
    else {
      // Fresh paper skeleton; metadata is filled in by the extractors.
      val cleanPaper = Paper(0, 0, Title(""), Nil, Abstract("Not saved"), Body("Not saved"), List(), Map.empty, List())
      val xmlDocument = XMLObjectsManager.constructXMLDocument(xml.get, "\n")

      if(xmlDocument == None) return None
      val paper = extract(extractionOrder, (xmlDocument.get, Some(cleanPaper), xmlDocument.get.getParagraphs))

      if(paper._2 == None) None
      else Some(paper._2.get.setMeta("parsed" -> "yes"))
    }
  }

}
\ No newline at end of file
diff --git a/app/paper/XMLScheduleParser.scala b/app/paper/XMLScheduleParser.scala new file mode 100644 index 0000000..f02a25e --- /dev/null +++ b/app/paper/XMLScheduleParser.scala @@ -0,0 +1,153 @@
package paper
import java.io.File

trait XMLScheduleParser {

  import scala.xml._
  import scala.collection.immutable.Map._

  // Overall function that loads the xml schedule and returns the papers with the extra data
  def getXMLSchedule(paperPos : String, papers : Option[List[Paper]]) : List[Paper] = {
    println("BEGIN OF XML SCHEDULING")
    val path = paperPos + Paths.sep + "schedule.xml"

    val schedule = (new File(path)).exists()

    // Fall back to the cached parsed papers when none were supplied.
    val loadedPapers = if(papers == None) CacheLoader.load(paperPos, Cache.parsed) else papers.get

    // Parse schedule
    val xml : Map[Int, Elem] = if(schedule) parse(path) else Map()

    println("END OF XML SCHEDULING")
    // match schedule with papers
    return matchXML(xml, loadedPapers, schedule)
  }

  // Function for taking care of parsing the xml
  def parse(paperPos : String) : Map[Int, Elem] = {

    // Load schedule file
    val schedule : Elem = XML.loadFile(paperPos)

    // Initialize Map
    var data : Map[Int, Elem] = Map.empty

    // For each session record room, date, session
    (schedule \\ "session") foreach (session => {

      var date = (session \ "date").text
      var room = (session \ "room").text
      var sess = (session \ "code").text + ": " + (session \ "sessiontitle").text

      // Now for each paper record starttime, endtime, paperid, papertitle, abstract
      (session \\ "paper") foreach (paper => {
        // Create a hunk of xml containing all the data
        // NOTE(review): the XML literal's tags appear to have been stripped
        // from this source dump — presumably <data><date>{date}</date>… —
        // the line is kept exactly as found; verify against history.
        var xml = { date }{ room }{ sess }{ paper }
        var id = (paper \\ "paperid").text
        data = data + (id.toInt -> 
xml)
      })
    })

    // Return the map of all the data
    return data
  }

  // Function for putting the xml in the right paper
  def matchXML(xml : Map[Int, Elem], papers : List[Paper], schedule: Boolean) : List[Paper]= {
    // Applies the schedule data to one paper and persists the result.
    def apply(p: Paper, data: Option[Elem] ): Paper = {
      // Get resulting paper
      val result = setXMLData(data, p, schedule)

      // Save result
      Cache.save(result, Cache.scheduled)

      // Return result
      result
    }

    // Loop through all papers and add the xml elements to the appropriate one
    return for (p <- papers) yield {
      val xmlObject = xml.get(p.id)
      if(schedule && xmlObject == None) {println("No schedule data for paper with id: " + p.id); p}
      else apply(p, xmlObject)
    }
  }

  // Putting the xml in a paper. When no schedule exists, only placeholder
  // metadata is written so downstream code finds the expected keys.
  def setXMLData(xmlObject : Option[Elem], paper : Paper, schedule: Boolean) : Paper = {
    if(schedule) {
      val xml = xmlObject.get
      paper.setMeta("xmldate" -> getDate(xml))
           .setMeta("xmlroom" -> getRoom(xml \\ "room"))
           .setMeta("xmlsession" -> (xml \\ "sess").text)
           .setMeta("xmlstarttime" -> (xml \\ "starttime").text)
           .setMeta("xmlendtime" -> (xml \\ "endtime").text)
           .setMeta("xmlpaperid" -> (xml \\ "paperid").text)
           .setMeta("xmlsessionid" -> (xml \\ "sessionid").text)
           .setMeta("xmlpapertitle" -> (xml \\ "papertitle").text)
           .setMeta("xmlabstract" -> (xml \\ "abstract").text)
           .setMeta("xmlauthors" -> getAuthors(xml \\ "authors").mkString(", "))
           .setTitle(new Title((xml \\ "papertitle").text))
           .setAuthors(getAuthors(xml \\ "authors").map(a => Author(formatAuthors(a))))
    } else {
      paper.setMeta("xmldate" -> "-")
           .setMeta("xmlroom" -> "-")
           .setMeta("xmlpapertitle" -> paper.title.toString())
           .setMeta("xmlauthors" -> paper.authors.mkString(", "))
    }
  }

  // Converts an authors XML node to a list of author name strings
  def getAuthors(authors : NodeSeq) : List[String] = {
    return (for (a <- (authors \ "author")) yield (a \\ "name").text).toList
  }

  // Maps a track name to the concrete conference room. Previously this match
  // was non-exhaustive and threw MatchError for any unknown track; unknown
  // values now fall back to the raw text (backward compatible for all
  // previously-working inputs).
  def getRoom(room : NodeSeq) : String = {
    room.text match {
      case "Track 1" => "Kresge Rehearsal B (030)"
      case "Track 2" => "Kresge Auditorium (109)"
      case "Track 3" => "Stratton S. de P. Rico (202)"
      case "Track 4" => "Stratton 20 Chimneys (306)"
      case "Track 5" => "Kresge Little Theatre (035)"
      case "Track 6" => "Kresge Rehearsal A (033)"
      case "Track 7" => "Stratton (407)"
      case "Track 8" => "Stratton (491)"
      case "Track 9" => "Stratton West Lounge (201)"
      case other    => other
    }
  }

  // Converts the schedule's day-of-month + "hh:mm" start time into a
  // millisecond epoch timestamp string.
  def getDate(xml : Elem) : String = {
    import java.util.Calendar
    import java.sql.Timestamp

    val date = (xml \\ "date").text
    val time = (xml \\ "starttime").text

    val dateNum : Int = date.takeWhile(_.isDigit).toInt
    val hourNum : Int = time.split(':').head.toInt
    val minNum : Int = time.split(':').last.toInt

    // Get Calendar
    val c = Calendar.getInstance

    // Set starting point; month 6 is July (Calendar months are 0-based).
    c.set(2012, 6, dateNum, hourNum, minNum)
    c.set(Calendar.SECOND,0)
    c.set(Calendar.MILLISECOND,0)

    // Get a timeStamp
    val t = (new Timestamp(c.getTime.getTime).getTime).toString

    return t
  }

  // Abbreviates given names to initials, e.g. "John Ronald Doe" -> "J. R. Doe".
  def formatAuthors(name : String) : String = {
    var result = ""
    var names = name.split(" ").filter(n => n.length > 0)
    if (names.length > 0) {
      result = names.init.filter(n => n.length > 0).map(n => n.head).mkString("",". ",". 
") + names.last + } + return result + } + +} diff --git a/ajax.php b/app/views/ajax.php similarity index 95% rename from ajax.php rename to app/views/ajax.php index 65fa69e..1997f9b 100644 --- a/ajax.php +++ b/app/views/ajax.php @@ -26,11 +26,11 @@ function get_abstract() { // Get ID $id = $_GET["id"]; - $file = "data/".$id.".abstract.txt"; + $file = $id.".abstract.txt"; // Make sure we have sufficient zeros in front for ($i = 0; $i < 10 && !file_exists( $file ); $i++) - $file = "data/".str_repeat("0",$i).$id.".abstract.txt"; + $file = str_repeat("0",$i).$id.".abstract.txt"; // Now get contents (which are formatted as UTF-16) $content = file_get_contents($file, FILE_TEXT); diff --git a/index.html b/app/views/index.scala.html old mode 100755 new mode 100644 similarity index 65% rename from index.html rename to app/views/index.scala.html index 41126d5..59d711d --- a/index.html +++ b/app/views/index.scala.html @@ -1,42 +1,48 @@ +@(link: String) +@import helper._ + + + + - + - - - + + + - - - - - + + + + + - - + + + + - - Trailhead: A Graphical Representation of Articles -
-