sub fill_Tables{ #Connecting to the database ############################################################# $dsn = "DBI:mysql:database=$database;host=$hostname"; $dbh = DBI->connect($dsn, $login, $password) || die "Couldn't connect to $database : $!\n"; #Filling tables ############################################################# #Filling table 'article' ############################################################# #Gathering data from daily output for insertion ############################################################# #Directory with Unitex output files my @output_concord_files = glob("output_concord/*.txt"); #Using 'glob' implies random order of files => sort @output_concord_files = map{$_->[1]} sort{$a->[0] <=> $b->[0]} map{/output_concord\/concord\.(.*)\.txt/; [$1, $_]} @output_concord_files; #Declaring variable for data extraction my ($output_concord_file, $url, $sys_time, $event); my (@output_concord_files_prepare, @url_prepare, @sys_time_prepare, @event_prepare, @index); #Opening, reading, and extracting column content of each concord.n.txt file foreach $output_concord_file(@output_concord_files){ #Note : for the concord file, no processing implied -> stored by default in $output_concord_file open (my $fh, '<:utf8', "$output_concord_file") || die "Couldn't open $output_concord_file : $!\n"; #Populating @output_concord_files_prepare array for column 'concord_file' insertion push @output_concord_files_prepare, $output_concord_file; while (<$fh>){ if ($_ =~ /=\[=(.*)=\]=/){ $url = $1; #Populating @url_prepare array for column 'url' from 'article' table insertion push @url_prepare, $url; } if ($_ =~ /=\[\+(.*)\+\]=/){ $sys_time = $1; #Populating @sys_time_prepare array for column 'sys_time' from 'article' table insertion push @sys_time_prepare, $sys_time; } if ($_ =~ /=E-(.*)=event/){ $event = $1; #Populating @event_duplicates array for column 'event' from 'event' table insertion push @event_prepare, $event; #print "$output_concord_file -> $event\n"; push @index, $output_concord_file, $event; } } } #Input files for extraction of column : html_extr_text my $dir_html_pages = 'html_pages'; #Opening directory for readdir method, extracting only '.txt' files -> cohabitation with '.html' files opendir (DIRHTMLPAGES, $dir_html_pages) || die "Coudln't open $dir_html_pages : $!"; #Extracting only the '.txt' format files my @html_pages_files = map{s/\.[^.]+$//;$_}grep{/\.txt$/} readdir DIRHTMLPAGES; closedir (DIRHTMLPAGES); #Mapping and sorting the output following the readdir method pattern the "html.n.txt" file number, where n = 1 .. n @html_pages_files = map{$_->[1]} sort{$a->[0] <=> $b->[0]} map{/html\.(.*)/; [$1, $_]} @html_pages_files; #Adding path 'html_pages/' and extension '.txt' of each file my @html_pages_files_extended; my $path = "html_pages\/"; my $extension = "\.txt "; my ($html_page_file, $extended); foreach $html_page_file(@html_pages_files){ $extended = $path . $html_page_file . $extension; #Populating @html_pages_files_extended for column html_extr_text insertion push @html_pages_files_extended, $extended; } #print "@output_concord_files_prepare\n"; #print "@url_prepare\n"; #print "@sys_time_prepare\n"; #print "@event_prepare\n"; #print "@html_pages_files_extended\n"; #Insertion of extracted and synchronized data in 'article' table of e_slide database my $i_article; my $id_article = 0; my @article_index; for ($i_article = 0; $i_article < @output_concord_files_prepare; $i_article++){ #$dbh->do(" # INSERT INTO `article`(`id_article`, `url`, `html_extr_text`,`concord_file`, `sys_time`) # VALUES ('$id_article', '$url_prepare[$i_article]', '$html_pages_files_extended[$i_article]', '$output_concord_files_prepare[$i_article]', '$sys_time_prepare[$i_article]') #") || die $dbh->errstr; push @article_index, $i_article; } #print "@article_event_index\n"; #print "@article_index\n"; $id_article++; #Insertion of extracted and synchonized data in 'event' table of e_slide database my $i_event; my $id_event = 0; my @event_index; for ($i_event = 0; $i_event < @event_prepare; $i_event++){ #$dbh->do(" # INSERT INTO `event`(`id_event`, `event`) # VALUES ('$id_event', '$event_prepare[$i_event]') #") || die $dbh->errstr; push @event_index, $i_event; } #print "@event_index\n"; $id_event++; my $index_concord; my @index_concord; foreach my $index(@index){ if ($index =~ /output_concord\/concord\.(.*)\.txt(.*)/){ $index_concord = $1; push @index_concord, $index_concord; } } #print "@index_concord\n"; #Insertion of extracted and sychronized indexes in 'article_event_index' my $i_article_event_index; for ($i_article_event_index; $i_article_event_index < @event_index; $i_article_event_index++){ $dbh->do(" INSERT INTO `article_event_index`(`id_article`, `id_event`) VALUES ('$index_concord[$i_article_event_index]', '$event_index[$i_article_event_index]') ") || die $dbh->errstr; } }