Bläddra i källkod

Parseur xml pour trouver les infos

Loquicom 5 år sedan
förälder
incheckning
66e3ba7e2a
1 ändrad fil med 96 tillägg och 49 borttagningar
  1. src/extract/parser.js (+96 -49)

+ 96 - 49
src/extract/parser.js

@@ -1,30 +1,28 @@
 const Saxophone = require('saxophone');
-const file = require('../../file');
-const skip = require('../../skip');
-
-const TAGS = [
-    'phdthesis',
-    'mastersthesis',
-    'incollection',
-    'book',
-    'inproceedings',
-    'proceedings',
-    'www',
-    'article'
-];
+const file = require('../file');
+const skip = require('../skip');
+const attr = require('./attribute');
 
 let instance = null;
 
 const parser = class Parser {
 
     constructor() {
-        this._auth = [];
-        this._inTag = false;
-        this._tag = '';
-        this._isAuth = false;
+        // Var de recolte de données
+        this._article = {};
+        this._proceed = {};
+        // Var de position dans le fichier
+        this._inArticle = false;
+        this._inProceed = false;
+        this._inAuth = false;
+        this._inTitle = false;
+        this._inYear = false;
+        this._key = '';
+        // Var gestion class
         this.callback = null;
         this.source = null;
-        this.dest = null;
+        this.auth = null;
+        // Var parametrage sax
         this.sax = new Saxophone();
         this.sax.on('error', this._error);
         this.sax.on('tagopen', this._opentag);
@@ -37,16 +35,8 @@ const parser = class Parser {
         if (this.source === null) {
             throw 'No source file';
         }
-        if (this.dest === null) {
-            throw 'No destination file';
-        }
-        if (!file.makedir(this.dest, true)) {
-            throw 'Unable to create destination file';
-        }
-        if (file.exist(this.dest)) {
-            if (!file.delete(this.dest)) {
-                throw 'Unable to delete existing destination file';
-            }
+        if (this.auth === null) {
+            throw 'No author to search';
         }
         this.callback = callback;
         file.fs.createReadStream(this.source, {start: skip.begin(this.source, '<!DOCTYPE')}).pipe(this.sax);
@@ -63,11 +53,8 @@ const parser = class Parser {
         return this;
     }
 
-    to(dest) {
-        if (!file.makedir(dest, true)) {
-            throw 'Unable to create destination folder';
-        }
-        this.dest = dest;
+    search(auth) {
+        this.auth = auth;
         return this;
     }
 
@@ -76,36 +63,96 @@ const parser = class Parser {
     }
 
     _opentag(tag) {
-        if (!instance._inTag && TAGS.indexOf(tag.name) !== -1) {
-            instance._inTag = true;
-            instance._tag = tag.name;
-            instance._auth = [];
-        } else if (!instance._isAuth && tag.name === 'author') {
-            instance._isAuth = true;
+        if (!instance._inArticle && tag.name === 'article') {
+            const attribute = attr.parse(tag.attrs);
+            instance._inArticle = true;
+            instance._key = attribute.key;
+            instance._article[instance._key] = {key: instance._key, auth: []};
+        } else if (!instance._inProceed && tag.name === 'inproceedings') {
+            const attribute = attr.parse(tag.attrs);
+            instance._inProceed = true;
+            instance._key = attribute.key;
+            instance._proceed[instance._key] = {key: instance._key, auth: []};
+        } else if (!instance._inAuth && tag.name === 'author') {
+            instance._inAuth = true;
+        } else if (!instance._inTitle && tag.name === 'title') {
+            instance._inTitle = true;
+        } else if (!instance._inYear && tag.name === 'year') {
+            instance._inYear = true;
         }
     }
 
     _closetag(tag) {
-        if (instance._inTag && instance._tag === tag.name) {
-            instance._inTag = false;
-            // On ne garde que les groupes d'auteurs
-            if (instance._auth.length > 1) {
-                file.append(instance.dest, JSON.stringify(instance._auth) + '\n');
+        if (instance._inArticle && tag.name === 'article') {
+            // Regarde si l'auteur recherché est dans la liste
+            if (instance._article[instance._key]['auth'].indexOf(instance.auth) === -1) {
+                delete instance._article[instance._key];
             }
-        } else if (instance._isAuth && tag.name === 'author') {
-            instance._isAuth = false;
+            // Reset
+            instance._inArticle = false;
+            instance._key = '';
+        } else if (instance._inProceed && tag.name === 'inproceedings') {
+            // Regarde si l'auteur recherché est dans la liste
+            if (instance._proceed[instance._key]['auth'].indexOf(instance.auth) === -1) {
+                delete instance._proceed[instance._key];
+            }
+            // Reset
+            instance._inProceed = false;
+            instance._key = '';
+        } else if (instance._inAuth && tag.name === 'author') {
+            instance._inAuth = false;
+        } else if (instance._inTitle && tag.name === 'title') {
+            instance._inTitle = false;
+        } else if (instance._inYear && tag.name === 'year') {
+            instance._inYear = false;
         }
     }
 
     _text(text) {
-        if (instance._isAuth) {
-            instance._auth.push(text.contents);
+        if (instance._inArticle) {
+            if (instance._inAuth) {
+                instance._article[instance._key]['auth'].push(text.contents);
+            } else if (instance._inTitle) {
+                instance._article[instance._key]['title'] = text.contents;
+            } else if (instance._inYear) {
+                instance._article[instance._key]['year'] = text.contents;
+            }
+        } else if (instance._inProceed) {
+            if (instance._inAuth) {
+                instance._proceed[instance._key]['auth'].push(text.contents);
+            } else if (instance._inTitle) {
+                instance._proceed[instance._key]['title'] = text.contents;
+            } else if (instance._inYear) {
+                instance._proceed[instance._key]['year'] = text.contents;
+            }
         }
     }
 
     _finish() {
+        // Recup resultat
+        const result = {
+            article: instance._article,
+            proceed: instance._proceed,
+            coauth: []
+        };
+        // Recup des co-autheurs
+        for (let key in result.article) {
+            result.article[key].auth.forEach(elt => {
+                if (elt !== instance.auth && result.coauth.indexOf(elt) === -1) {
+                    result.coauth.push(elt);
+                }
+            });
+        }
+        for (let key in result.proceed) {
+            result.proceed[key].auth.forEach(elt => {
+                if (elt !== instance.auth && result.coauth.indexOf(elt) === -1) {
+                    result.coauth.push(elt);
+                }
+            });
+        }
+        // Appel du callback
         if (instance.callback !== null) {
-            instance.callback(instance.dest);
+            instance.callback(result);
         }
     }