Need Lib for Next Instractions

#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
from Kuytu.wikiLog import KuytuLog
from Kuytu.WikiDumpParser import WikiDumpParser
import Kuytu.file_commander as K_fc

import json

Create custom Log from Kuytu

log = KuytuLog('BlackBoxCode')

Wiki Dump Seperation Stars

Giving Wiki-Dump XML path wait for seperation

# ----------- WIKI DUMP XML PARSE ------------------------------- Execution ----
# XML Getting Memory
print '-'*10 + 'WIKI DUMP XML GETTING MEMORY' + '-'*10 
wdp = WikiDumpParser('./Data/wikidump.xml')
#wdp = WikiDumpParser('./Data/part.xml') # A Part of The Orjinal WIKI-DUMP Data



# Starting to parse all page in XML file
print '-'*14 + 'WIKI DUMP PARSE START' + '-'*13
wdp.extract_pages(StoreAllText = False, NumberofParagraph = 2 )
# ------------------------------------------------------------------------------

----------WIKI DUMP XML GETTING MEMORY----------
--------------WIKI DUMP PARSE START-------------

After Seperation saving Seperation Log to our 'Kuytu_log' file

# ----------- WIKI DUMP XML PARSE -------------- Save Log Info (of execution)---
# non_article_count, no_infoBox_count, error_count, number_of_total_article, number_of_article_has_infoBox
log.save_log('WIKI DUMP PARSE - RESULT', json.dumps(wdp.getLog(),indent=4,ensure_ascii=False, encoding='utf8') )
# ------------------------------------------------------------------------------

Take the [Uniq Info Box Type - Hit Count] data and Graph it

# ----------- WIKI DUMP XML PARSE -------------- Save Uniq InfoBoxCounts (B.K.)-
output_path = log.get_output_path()
#print json.dumps( wdp.get_uniqInfoBoxTypes(),indent=4,ensure_ascii=False, encoding='utf8')
K_fc.save_Uniq_InfoBoxTypes( output_path + '/Uniq-BK-Types-Hit-Counts.txt', wdp.get_uniqInfoBoxTypes() )
K_fc.save_Graph( output_path = output_path
                ,data = wdp.get_uniqInfoBoxTypes()
                ,min_repetition = 100
                ,title = 'Uniq-BK-Types-Hit-Counts-Graph(>100)' )
log.logging('Uniq-BK-Types-Hit-Counts-Graph(>100) Saved')
# ------------------------------------------------------------------------------

<Figure size 2000x1100 with 1 Axes>

Save The Seperated Pages in 3 Group

-> Template of the returnin valu of wdp.get_all_articles()
{ 
    "withInfoBox_articles_list" 	: [....articleObject...],
    "withOUTInfoBox_articles_list"  : [..(article_XML_TEXT,Article_Title,Article_Id)..],
    "NonStandart_articles_list" 	: [..article_XML_TEXT..]
}

# ----------- WIKI DUMP XML PARSE ----------------------- Save Article Pages ---
allArticles = wdp.get_all_articles()


# Pages WithOut InfoBox 
withOUTInfoBoxPagesXMLPath = output_path + 'BulkData/withOUTInfoBoxPages_bulkXML.xml'
withOUTInfoBoxPagesIndexPath = output_path + 'BulkData/withOUTInfoBoxPages_bulkXML_index.txt'
l1 = K_fc.save_XML(withOUTInfoBoxPagesXMLPath, withOUTInfoBoxPagesIndexPath, allArticles['withOUTInfoBox_articles_list'] )

# Pages With InfoBox 
StandartPagesXMLPath = output_path + 'BulkData/withInfoBoxPages_bulkXML.xml'
StandartPagesIndexPath = output_path + 'BulkData/withInfoBoxPages_bulkXML_index.txt'
l2 = K_fc.save_XML(StandartPagesXMLPath, StandartPagesIndexPath, allArticles['withInfoBox_articles_list'] )

# Pages - NonStandart
nonStandartPagesXMLPath = output_path + 'BulkData/NonStandartPages_bulkXML.xml'
nonStandartPagesIndexPath = output_path + 'BulkData/NonStandartPages_bulkXML_index.txt'
l3 = K_fc.save_XML(nonStandartPagesXMLPath, nonStandartPagesIndexPath, allArticles['NonStandart_articles_list'] )

log.logging([l1,l2,l3])  ## Save '.save_XML' log.

# ------------------------------------------------------------------------------

xmlParseCharRef: invalid xmlChar value 55296, line 23152, column 36 (line 23152)
!! [prettyPrintXml] Execution Had Some Errors!!
!-- 690516 Article Saved Successfully -- FileName(withOUTInfoBoxPages_bulkXML.xml)----------!
!-- 180223 Article Saved Successfully -- FileName(   withInfoBoxPages_bulkXML.xml)----------!
xmlParseCharRef: invalid xmlChar value 55357, line 15669, column 514 (line 15669)
!! [prettyPrintXml] Execution Had Some Errors!!
!--   3157 Article Saved Successfully -- FileName(   NonStandartPages_bulkXML.xml)----------!

### First part of the Wiki Full Extraction is finished

#### The Outputs are ;

Cleaning The Parsed Data (With InfoBoxes)

import Kuytu.Analyzer as Analyzer
from collections import Counter

##### When code runs from begining
Articles_with_BK = allArticles['withInfoBox_articles_list']
allArticles = None

when code starts to run here

from Kuytu.wikiLog import KuytuLog from Kuytu.WikiDumpParser import WikiDumpParser import Kuytu.file_commander as K_fc import json

log = KuytuLog('BlackBoxCode','r') output_path = log.get_output_path()

StandartPagesXMLPath = '../test_data.xml'

#StandartPagesXMLPath = output_path + 'BulkData/withInfoBoxPages_bulkXML.xml'

Articles_with_BK = K_fc.read_XML(StandartPagesXMLPath)

Saving Histogram of Clean DA's owned Articles

uniq_types = map(lambda a : a.get_infoBox_type() , Articles_with_BK)
c = Counter( uniq_types )
uniq_types_histogram = list(c.items())

output_path = log.get_output_path()
#print json.dumps( wdp.get_uniqInfoBoxTypes(),indent=4,ensure_ascii=False, encoding='utf8')
K_fc.save_Uniq_InfoBoxTypes( output_path + '/Uniq-BK-Types-Hit-Counts-Graph-Clean(>100).txt', uniq_types_histogram )
K_fc.save_Graph( output_path = output_path
                ,data = uniq_types_histogram
                ,min_repetition = 100
                ,title = 'Uniq-BK-Types-Hit-Counts-Graph-Clean(>100)' )
log.logging('Uniq-BK-Types-Hit-Counts-Graph-Clean(>100) Saved')

According to histogram graphs ınterested domain can be seperated

Seperation Interested Domain Related Info Box Types

Interested_Info_Box_Types = [ u'Hakem' ,u'Manken' ,u'Makam Sahibi' ,u'Filozof' ,u'Bilim Insanı',u'Güreşçi' 
                             ,u'Bilim Adamı' ,u'Sporcu' ,u'Buz Patencisi',u'Asker' 
                             ,u'Voleybolcu' ,u'Sanatçı',u'Futbolcu' ,u'Oyuncu' 
                             ,u'Müzik Sanatçısı' ,u'Yazar' ,u'Kraliyet' ,u'Tenis Sporcu' ,u'Profesyonel Güreşçi'
                             ,u'Kişi' ,u'Basketbolcu']
str_of_BKs = json.dumps(Interested_Info_Box_Types,indent = 4,ensure_ascii=False, encoding='utf8').encode('utf-8')
log.save_log('Interested ınfoBoxes are', str_of_BKs )
Interested_articles_with_BK =  filter(lambda a: a.get_infoBox_type() in Interested_Info_Box_Types , Articles_with_BK)


prt_log = '#Articles interested categories '+ str(len(Interested_articles_with_BK))

log.logging(prt_log)
print prt_log

#Articles interested categories 53658

After the seperation for a certain set of articles can clean

- InfoBox fields will clean - many banned key or value can be setted.

- Choosen first n paragraphs will clean

- All text of articles will clean

Cleaning The Texts of Articles

Cleaning Info Boxes Generally

from Kuytu.article_cleaner_kit import clean_InfoBoxBulk 

for i,article in enumerate(Interested_articles_with_BK):
    
    # read 
    bulk_BK_of_article  = article.get_infoBoxText()
    # clean
    clean_BK_of_article = clean_InfoBoxBulk(bulk_BK_of_article)
    # write
    Interested_articles_with_BK[i].set_infoBox_clean(clean_BK_of_article)
    
# Eliminate None parsed articles
Interested_articles_with_BK_clean =  filter(lambda a: a.get_cleanInfoBox() != None , Interested_articles_with_BK)

Paragraph Cleaning

from Kuytu.article_cleaner_kit import clean_paragraphs

for i,article in enumerate(Interested_articles_with_BK_clean):
    # read 
    paragraphs_of_article  = article.get_bulkParagraphs()
    # clean
    cleaned_paragraphs = clean_paragraphs(paragraphs_of_article)
    # write
    Interested_articles_with_BK_clean[i].set_cleanParagraphs(cleaned_paragraphs)

All Text Cleaning

from Kuytu.article_cleaner_kit import clean_bulk_text

for i,article in enumerate(Interested_articles_with_BK_clean):
    # read 
    bulk_text_of_article  = article.get_allBulkText()
    # clean
    cleaned_text = clean_bulk_text(bulk_text_of_article)
    # write
    Interested_articles_with_BK_clean[i].set_cleanText(cleaned_text)

Split First 2 sentences with java Zemberek and custom regex

from Kuytu import Zemberek_Runner  
Zemberek_Runner.create_sentence_splittler_input_text(Articles_with_BK)

RUN Java ON TERMINAL // Jupyter notebook  not allow to export zemberek lib

from Kuytu import Zemberek_Runner
Zemberek_Runner.create_sentences()

from Kuytu import Zemberek_Runner  

Zemberek_Runner.re_read_Sentences(Articles_with_BK)

prt_log = '#Articles parsed clean(InfoBoxes-Paragraphs-AllText-Sentences) '+ str(len(Interested_articles_with_BK_clean))
log.logging(prt_log)
print prt_log

#Articles parsed clean(InfoBoxes-Paragraphs-AllText-Sentences) 53503

According to cleaned Info Boxes data can investigating

- For each InfoBoxType that interested articles can be seperated.

- 2 different name but nearly same types can make one

- Info Boxes data field keys can be mapped to certain value

Analysis and Manipulating the Data

seperated_interested_articles = Analyzer.seperate_articles_according_to_type(Interested_articles_with_BK_clean)

format of 'seperated_interested_articles'

{
 "ınfo_box_type_1" : [.....list of articles....],
 "ınfo_box_type_2" : [.....list of articles....],
 "ınfo_box_type_3" : [.....list of articles....],
 "ınfo_box_type_4" : [.....list of articles....],
 "ınfo_box_type_5" : [.....list of articles....],
 "ınfo_box_type_6" : [.....list of articles....]
 }

Same domain Type merge

try:
    for i,a in enumerate(seperated_interested_articles[u"Bilim Adamı"]):
        seperated_interested_articles[u"Bilim Adamı"][i].set_infoBox_type(u'Bilim Insanı')
    seperated_interested_articles[u"Bilim Insanı"] += seperated_interested_articles[u"Bilim Adamı"]        
    del seperated_interested_articles[u"Bilim Adamı"]
except Exception as e:
    pass

Mapping Certain Info Box Types Keys

map_ = {
    u'Manken' :{
        "yer" : "doğumyeri"
    },
    u'Hakem' : {
        "etkinyıl" : "aktifyıl",
        "yıl" : "aktifyıl",
        "yer" : "doğumyeri"
    }
}         
seperated_interested_articles = Analyzer.datafield_map(map_, seperated_interested_articles)

# Data Fields Counting of InfoBoxes Types

total_count_data, count_data = Analyzer.data_field_counter(seperated_interested_articles)

output_path = log.get_output_path()
total_article_count = float(len(Interested_articles_with_BK_clean))
subcats_article_counts =  { type_ :float(len(list_)) for type_,list_ in seperated_interested_articles.items()}

Analyzer.save_count_data( path = output_path + 'Counts'
                        , total_count_data =  total_count_data
                        , count_data = count_data
                        , total_first_n = 100
                        , subcat_first_n = 20
                        , total_article_count = total_article_count
                        , subcats_article_counts = subcats_article_counts )

Exporting the Data Field Count of The InfoBox Types