/co-teeft

Script permettant de créer des enrichissements "teeft" (keywords)

Primary Language: JavaScript

co-teeft

Script permettant de créer des enrichissements "teeft" (keywords)

Usages

Ce programme traite des fichiers .json contenant un tableau de docObjects sous la forme :

[
  { ... },
  { ... },
  { ... },
  { ... }
]

Pour lancer l'indexation, utiliser les commandes suivantes :

$ node index.js --help
Usage: index [options]

Options:
  --input <input>    required   input file
  --output <output>  required   output file
  --conf <conf>      optionnal   conf file (default: "conf.default.json")
  -h, --help         output usage information
$ node index.js --input=path/to/my/file.json --output=output.json
# Will create 2 files :
# output.json -> file containing enrichments (you can insert it into MongoDB)
# output.indexation.json -> file containing Teeft results (useful for getting more information than just the keywords)

Configuration

Le fichier de configuration devra ressembler à :

{
  "langages": [
    "en",                 // enable english indexation, remove it if you don't need it
    "fr"                  // enable french indexation, remove it if you don't need it
  ],
  "conf": {
    "fr": {
      "id": "sourceUid",   // Identifier of document (used for logs)
      "data": [            // Selectors of parts of text you want to process
        "title.fr",
        "abstract.fr"
      ],
      "enrichment": {      // Enrichment information
        "selectors": [         // Selectors used to identify document (used to build 'selectors' property of enrichment object)
          "sourceUid"
        ],
        "target": {            // Target of enrichment (used to build 'target' property of enrichment object)
          "from": "parent",
          "selector": "",
          "key": "enrichments.keywords.fr"
        }
      }
    },
    "en": {
      "id": "sourceUid",
      "data": [
        "title.en",
        "abstract.en"
      ],
      "enrichment": {
        "selectors": [
          "sourceUid"
        ],
        "target": {
          "from": "parent",
          "selector": "",
          "key": "enrichments.keywords.en"
        }
      }
    }
  }
}

Toutes ces informations permettront de créer des données d'enrichissement comme celles-ci :

{
  "selectors": [
    {
      "selector": "sourceUid",
      "values": [
        "hal$halshs-00961648"
      ]
    }
  ],
  "target": {
    "from": "parent",
    "selector": "",
    "key": "enrichments.keywords.fr"
  },
  "value": [
    {
      "frequency": 30,
      "strength": 1,
      "specificity": 1,
      "probability": 0.11952191235059761,
      "term": "social"
    },
    {
      "frequency": 19,
      "strength": 1,
      "specificity": 0.6333333333333333,
      "probability": 0.07569721115537849,
      "term": "sociabilité"
    },
    {
      "frequency": 15,
      "strength": 1,
      "specificity": 0.5,
      "probability": 0.05976095617529881,
      "term": "lien"
    },
    {
      "frequency": 10,
      "strength": 1,
      "specificity": 0.3333333333333333,
      "probability": 0.0398406374501992,
      "term": "réseaux"
    },
    {
      "frequency": 9,
      "strength": 1,
      "specificity": 0.3,
      "probability": 0.035856573705179286,
      "term": "technologie"
    },
    {
      "frequency": 8,
      "strength": 1,
      "specificity": 0.2666666666666666,
      "probability": 0.03187250996015936,
      "term": "relation"
    },
    {
      "frequency": 8,
      "strength": 2,
      "specificity": 0.2666666666666666,
      "probability": 0.03187250996015936,
      "term": "réseaux social"
    },
    {
      "frequency": 7,
      "strength": 2,
      "specificity": 0.23333333333333328,
      "probability": 0.027888446215139442,
      "term": "lien social"
    },
    {
      "frequency": 7,
      "strength": 1,
      "specificity": 0.23333333333333328,
      "probability": 0.027888446215139442,
      "term": "nouvelle"
    },
    {
      "frequency": 6,
      "strength": 1,
      "specificity": 0.19999999999999998,
      "probability": 0.02390438247011952,
      "term": "fracture"
    },
    {
      "frequency": 6,
      "strength": 1,
      "specificity": 0.19999999999999998,
      "probability": 0.02390438247011952,
      "term": "déclin"
    },
    {
      "frequency": 6,
      "strength": 1,
      "specificity": 0.19999999999999998,
      "probability": 0.02390438247011952,
      "term": "communication"
    },
    {
      "frequency": 5,
      "strength": 2,
      "specificity": 0.16666666666666666,
      "probability": 0.0199203187250996,
      "term": "nouvelle technologie"
    },
    {
      "frequency": 5,
      "strength": 1,
      "specificity": 0.16666666666666666,
      "probability": 0.0199203187250996,
      "term": "affaiblissement"
    },
    {
      "frequency": 5,
      "strength": 1,
      "specificity": 0.16666666666666666,
      "probability": 0.0199203187250996,
      "term": "ligne"
    },
    {
      "frequency": 5,
      "strength": 1,
      "specificity": 0.16666666666666666,
      "probability": 0.0199203187250996,
      "term": "internet"
    },
    {
      "frequency": 5,
      "strength": 1,
      "specificity": 0.16666666666666666,
      "probability": 0.0199203187250996,
      "term": "groupe"
    },
    {
      "frequency": 4,
      "strength": 1,
      "specificity": 0.1333333333333333,
      "probability": 0.01593625498007968,
      "term": "transformation"
    },
    {
      "frequency": 4,
      "strength": 1,
      "specificity": 0.1333333333333333,
      "probability": 0.01593625498007968,
      "term": "classe"
    },
    {
      "frequency": 4,
      "strength": 1,
      "specificity": 0.1333333333333333,
      "probability": 0.01593625498007968,
      "term": "distance"
    },
    {
      "frequency": 4,
      "strength": 1,
      "specificity": 0.1333333333333333,
      "probability": 0.01593625498007968,
      "term": "vision"
    },
    {
      "frequency": 4,
      "strength": 2,
      "specificity": 0.1333333333333333,
      "probability": 0.01593625498007968,
      "term": "fracture numérique"
    },
    {
      "frequency": 4,
      "strength": 1,
      "specificity": 0.1333333333333333,
      "probability": 0.01593625498007968,
      "term": "années"
    },
    {
      "frequency": 4,
      "strength": 1,
      "specificity": 0.1333333333333333,
      "probability": 0.01593625498007968,
      "term": "numérique"
    },
    {
      "frequency": 3,
      "strength": 2,
      "specificity": 0.09999999999999999,
      "probability": 0.01195219123505976,
      "term": "capital social"
    },
    {
      "frequency": 3,
      "strength": 2,
      "specificity": 0.09999999999999999,
      "probability": 0.01195219123505976,
      "term": "lien faible"
    }
  ]
}