Wikipedysta:Alan ffm/checkwiki.pl.js
Z Wikipedii, wolnej encyklopedii
$last_pos = get_next_nowiki($pos_nowiki + $last_pos); $loop_again = 1; #print 'nowiki'.' '.$pos_nowiki.' '.$last_pos."\n"; } if ($tag_first eq 'nowiki' and $pos_nowiki_end == -1) { # found and no $last_pos = $pos_nowiki +1; $loop_again = 1; #print 'nowiki no end'."\n"; my $text_output = substr( $text,$pos_nowiki); $text_output = text_reduce($text_output, 80); error_023_nowiki_no_correct_end( $text_output ); }
#!/usr/local/bin/perl
#################################################################
# Program:     checkwiki.pl
# Description: Scan all pages of a Wikipedia-Project (dump or live) for errors
# Author:      Stefan Kühn
# Version:     0.2
# Licence:     GPL
#################################################################

#################################################################
# Syntax
# perl -w checkwiki.pl p=enwiki m=live
# (arguments are given without a leading dash: check_input_arguments
#  only recognises arguments that start with "p=" / "m=")
#################################################################
# New features, last changes and discussion
# http://de.wikipedia.org/wiki/Benutzer:Stefan_Kühn/Check_Wikipedia
#################################################################

our $test_programm = 'true';    # only for program tests

# Main program flow.
# Every step after the argument check is skipped once $quit_program has been
# set to 'yes'; close_file() always runs, and the collected $quit_reason
# (if any) is printed last.
load_moduls();                  # standard perl moduls
declare_global_directorys();
declare_global_variables();
check_input_arguments();
open_file()               if ($quit_program eq 'no');    # dump or live
text_translation_input()  if ($quit_program eq 'no');    # dump or live
scan_pages()              if ($quit_program eq 'no');    # scan all article
close_file();
text_translation_output() if ($quit_program eq 'no');
output_errors()           if ($quit_program eq 'no');
output_statistic()        if ($quit_program eq 'no');
print $quit_reason        if ($quit_reason ne '');

sub load_moduls{
    #################################################################
    # Load Module
    #################################################################
    # Pulls in the modules the script depends on. Note that "use" is
    # processed at compile time, so the pragmas below are lexically scoped
    # to this sub only.
    #use lib "C:/perl/lib";
    use strict;
    use warnings;
    use URI::Escape;
    use LWP::UserAgent;
    use CGI::Carp qw(fatalsToBrowser);
    #use lib '/home/sk/perl/checkwiki';
    our $file_module_coordinate = 'coordinates.pm';
    # NOTE(review): "use" is a compile-time statement, so this -e guard does
    # not make the module optional; compilation fails when coordinates.pm is
    # missing. A runtime "require" looks like the intent - confirm before
    # changing, the module itself is not visible here.
    if (-e $file_module_coordinate) {
        use coordinates ;
    }
    # use new_coordinates;
    #use lib '../module';
    #use wikipedia;
    #use URI::Escape;
    #use LWP::UserAgent;
}

sub declare_global_directorys {
    # Declares the package globals that hold the input/output directories.
    our $dump_directory         = '/mnt/user-store/dump/';       # toolserver
    # our $dump_directory       = '../../dump/';                 # home or usb
    our $output_directory       = '../../data/checkwiki/';
    our $input_directory_new    = '../../data/new_article/';
    our
$input_directory_change = '../../data/last_changes/'; our $output_templatetiger = '../../data/templatetiger/'; our $output_geo = '../../data/geo/'; #our $dump_filename = '/mnt/user-store/dump/dewiki-20080607-pages-articles.xml'; #'Wikipedia-20080502083556.xml'; #our $dump_filename = '../../dump/dewiki-20071217-pages-articles.xml'; } sub declare_global_variables { ################################################################# # Declaration of variables (global) ################################################################# our $dump_or_live = ''; # scan modus (dump, live, only) our $silent_modus = ''; # silent modus (very low output at screen) for batch our $test_modus = ''; # silent modus (very low output at screen) for batch our $quit_program = 'no'; # quit the program (yes,no) our $quit_reason = ''; # quit the program reason our $time_start = time(); # start timer in secound our $time_end = time(); # end time in secound our $date = 0; # date of dump "20060324" our $line_number = 0; # number of line in dump our $project = ''; # name of the project 'dewiki' our $language = ''; # language of dump 'de', 'en'; our $page_number = 0; # number of pages in namesroom 0 our $base = ''; # base of article, 'http://de.wikipedia.org/wiki/Hauptseite' our $home = ''; # base of article, 'http://de.wikipedia.org/wiki/' our @namespace; # namespace values # 0 number # 1 namespace in project language # 2 namespace in english language our $namespaces_count = -1; # number of namespaces our @namespacealiases; # namespacealiases values # 0 number # 1 namespacealias our $namespacealiases_count= -1; # number of namespacealiases our @namespace_cat; #all namespaces for categorys our @namespace_image; #all namespaces for images our @namespace_templates; #all namespaces for templates our @magicword_defaultsort; our @magicword_img_thumbnail; our @magicword_img_manualthumb; our @magicword_img_right; our @magicword_img_left; our @magicword_img_none; our @magicword_img_center; our 
@magicword_img_framed; our @magicword_img_frameless; our @magicword_img_page; our @magicword_img_upright; our @magicword_img_border; our @magicword_img_sub; our @magicword_img_super; our @magicword_img_link; our @magicword_img_alt; our @magicword_img_width; our @magicword_img_baseline; our @magicword_img_top; our @magicword_img_text_top; our @magicword_img_middle; our @magicword_img_bottom; our @magicword_img_text_bottom; # Wiki-special variables our @live_article; # to-do-list for live (all articles to scan) our $current_live_article = -1; # line_number_of_current_live_article our $number_of_live_tests = -1; # Number of articles for live test our $current_live_error_scan = -1; # for scan every 100 article of an error our @live_to_scan ; # article of one error number which should be scanned our $number_article_live_to_scan = -1; # all article from one error our @article_was_scanned; #if an article was scanned, this will insert here our $error_counter = -1; # number of found errors in all article our @page_with_error; our @error_description; # Error Description # 0 priority # 1 title in English # 2 description in English # 3 number of found (only live scanned) # 4 priority of foreign language # 5 title in foreign language # 6 description in foreign language # 7 number of found in last scan (from statistic file) # 8 all known errors (from statistic file + live) our $number_of_max_errors = 100; # number of max error_description for (my $i = 0; $i <= $number_of_max_errors; $i++) { $error_description[$i][0] = -1; $error_description[$i][1] = ''; $error_description[$i][2] = ''; $error_description[$i][3] = 0; $error_description[$i][4] = -1; $error_description[$i][5] = ''; $error_description[$i][6] = ''; $error_description[$i][7] = 0; $error_description[$i][8] = 0; } our $max_error_count = 50; # maximum of shown article per error our $maximum_current_error_scan = -1; # how much shold be scanned for reach the max_error_count our $rest_of_errors_not_scan_yet = ''; our 
$number_of_all_errors_in_all_articles = 0; #all errors our $for_statistic_new_article = 0; our $for_statistic_last_change_article = 0; our $for_statistic_geo_article = 0; our $for_statistic_number_of_articles_with_error = 0; our $error_geo_counter = -1; # number of found errors in all article our @page_with_geo_error; our @error_geo_description; our $number_of_max_geo_errors = 100; for (my $i = 0; $i <= $number_of_max_geo_errors; $i++) { $error_geo_description[$i][0] = -1; $error_geo_description[$i][1] = ''; $error_geo_description[$i][2] = ''; $error_geo_description[$i][3] = 0; $error_geo_description[$i][4] = -1; $error_geo_description[$i][5] = ''; $error_geo_description[$i][6] = ''; $error_geo_description[$i][7] = 0; $error_geo_description[$i][8] = 0; } our $live_filename = 'input_for_live.txt'; our $output_live_wiki = 'output_for_wikipedia.txt'; our $output_dump_wiki = 'output_for_wikipedia_dump.txt'; our $error_list_filename = 'error_list.txt'; our $error_list_filename_only = 'error_list_only.txt'; our $error_list_filename_dump = 'error_list_dump.txt'; #all errors from the last dump scan our$error_list_filename_backup = 'error_list_dump_backup.txt'; our $error_statistic_filename = 'error_statistic.txt'; our $error_statistic_filename_only = 'error_statistic_only.txt'; our $error_statistic_filename_list = 'error_statistic_list.txt'; our $translation_file = 'translation.txt'; our $error_list_filename_30 = 'error_list_error_030.txt'; our $error_list_filename_every = 'error_list_error'; # for all errors our $error_geo_list_filename = 'error_geo_list.txt'; our $error_geo_list_filename_only = 'error_geo_list_only.txt'; our $error_geo_list_filename_html = 'error_geo_list.htm'; our $error_geo_list_filename_only_html = 'error_geo_list_only.htm'; our @inter_list = ( 'af', 'als', 'an', 'ar', 'bg', 'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 'fy', 'gl', 'gv', 'he', 'hi', 'hr', 'hu', 'id', 'is', 'it', 'ja', 'jv', 'ka', 'ko', 'la', 
'lb', 'lt', 'ms', 'nds', 'nds_nl', 'nl', 'nn', 'no', 'pl', 'pt', 'ro', 'ru', 'sh', 'simple', 'sk', 'sl', 'sr', 'sv', 'sw', 'ta', 'th', 'tr', 'uk', 'ur', 'vi', 'vo', 'yi', 'zh' ); our @foundation_projects = ( 'wikibooks', 'b', 'wiktionary', 'wikt', 'wikinews', 'n', 'wikiquote', 'q', 'wikisource', 's', 'wikipedia', 'w', 'wikispecies', 'species', 'wikimedia', 'foundation', 'wmf', 'wikiversity', 'v', 'commons', 'meta', 'metawikipedia', 'm', 'incubator', 'mw', 'quality', 'bugzilla', 'mediazilla', 'nost', 'testwiki' ); # current time get_time(); our $translation_page = ''; # name of the page with translation for example in de: "Wikipedia:WikiProject Check Wikipedia/Übersetzung" our $start_text = ''; $start_text = $start_text ."The WikiProject '''Check Wikipedia''' will help to clean up the syntax of Wikipedia and to find some other errors.\n"; $start_text = $start_text ."\n"; $start_text = $start_text ."'''Betatest''' - At the moment the script has some bugs and not every error on this page is an actual error. \n"; $start_text = $start_text ."\n"; our $description_text = ''; $description_text = $description_text ."== Project description in English == \n"; $description_text = $description_text ."* '''What is the goal of this project?'''\n"; $description_text = $description_text ."** This project should help to clean up the data of all articles in many different languages.\n"; $description_text = $description_text ."** If we have a clear and clean syntax in all articles more projects (for example: Wikipedia-DVD) can use our data more easily.\n"; $description_text = $description_text ."** The project was inspired by [[:en:Wikipedia:WikiProject Wiki Syntax]].\n"; $description_text = $description_text ."** In order to use the data of a Wikipedia project without the Mediawiki software you need to write a parser. 
If many articles include wrong syntax it is difficult to program the parser since it needs to be complex enough to recognize the syntax errors.\n"; $description_text = $description_text ."** This project helps to find many errors in all kinds of language and will support many languages in the future. \n"; $description_text = $description_text ."\n"; $description_text = $description_text ."* '''How does it work?'''\n"; $description_text = $description_text ."** The script scans every new [http://dumps.wikimedia.org dump] and creates a list of articles with errors.\n"; $description_text = $description_text ."** The script scans all articles on the list on a daily basis to create a new list for users, omitting already-corrected articles.\n"; $description_text = $description_text ."** The script is written in Perl by: [[:de:User:Stefan Kühn|Stefan Kühn]] "."\n"; $description_text = $description_text ."** You can download the script [http://toolserver.org/~sk/checkwiki/checkwiki.pl here]. It is licensed under GPL."."\n"; $description_text = $description_text ."** [[:de:User:Stefan Kühn/Check Wikipedia|New features, last changes and discussion]]. "."\n"; $description_text = $description_text ."\n"; $description_text = $description_text ."* '''What can you do?'''\n"; $description_text = $description_text ."** The script creates a new error page at the toolserver every day. Please copy and paste that [http://toolserver.org/~sk/checkwiki/".$project.'/'.$project."_output_for_wikipedia.html page at the toolserver] to this page here. Attention: That page is a UTF-8 document. In case your browser cannot display the file in UTF-8 you can copy it into a text editor (for example: Notepad++) and convert it to UTF-8. \n"; $description_text = $description_text ."** You can fix an error in one or more articles. \n"; $description_text = $description_text ."** You can delete all fixed articles from this list. 
\n";
    $description_text = $description_text ."** If all articles in one category have been fixed you can delete this category. \n";
    $description_text = $description_text ."** You can suggest a new category of errors to the author of the script. \n";
    $description_text = $description_text ."** You can also inform the author if you want this project to be implemented into your language's Wikipedia. \n";
    $description_text = $description_text ."\n";
    $description_text = $description_text ."* '''Please don't… '''\n";
    $description_text = $description_text ."** insert an article by hand since it will disappear from the list with the next automatic update of this page. \n";
    $description_text = $description_text ."** try to fix spelling mistakes within this page since all manual changes will disappear as well with the next update. Instead, send an e-mail or message to the author so he can fix the spelling in the script. \n";
    $description_text = $description_text ."\n";

    our $category_text = '';

    our $top_priority_script = 'Top priority';
    our $top_priority_project = '';
    our $middle_priority_script = 'Middle priority';
    our $middle_priority_project = '';
    our $lowest_priority_script = 'Lowest priority';
    our $lowest_priority_project = '';
}

sub get_time{
    # Stores the current local time in package globals.
    # Month is converted to 1..12 and the year to four digits; month, day of
    # month, hours and minutes are left-padded with "0" to two characters.
    # The clock is read once so that the broken-down fields and
    # $CTIME_String describe the same instant.
    my $now = time;
    our ($akSekunden, $akMinuten, $akStunden, $akMonatstag, $akMonat, $akJahr, $akWochentag, $akJahrestag, $akSommerzeit) = localtime($now);
    our $CTIME_String = localtime($now);
    $akMonat     = $akMonat + 1;
    $akJahr      = $akJahr + 1900;
    $akMonat     = "0".$akMonat     if ($akMonat<10);
    $akMonatstag = "0".$akMonatstag if ($akMonatstag<10);
    $akStunden   = "0".$akStunden   if ($akStunden<10);
    $akMinuten   = "0".$akMinuten   if ($akMinuten<10);
}

sub check_input_arguments{
    #################################################################
    # Declaration of parameters (extern)
    #################################################################
    # Reads @ARGV and fills the globals $project, $language, $dump_or_live,
    # $silent_modus and $test_modus.
    #   p=<project>        project name, for example p=dewiki (required)
    #   m=dump|live|only   scan modus (required)
    #   silent             very low output at screen
    #   test               appends '_test' to $project
    # On a missing or unknown argument $quit_program is set to 'yes' and a
    # usage text is appended to $quit_reason; otherwise a start banner is
    # printed.

    if ( @ARGV < 1) {
        # no parameters
        $quit_reason .= 'no parameters'."\n\n";
        $quit_program = 'yes';
    }

    ###################
    #check argument value for project
    my $found_argv = 'no';
    foreach my $current_argv (@ARGV) {
        # an empty name ("p=" alone) is treated like a missing project
        if ( $current_argv =~ /^p=(.+)$/ ) {
            $found_argv = 'yes';
            $project    = $1;
            $language   = $project;
            # strip only the trailing "wiki" so a name that contains
            # "wiki" elsewhere keeps the rest of its name
            $language   =~ s/wiki$//;
        }
    }
    if ($found_argv eq 'no'){
        # no project name
        $quit_reason .= 'no project name, for example: "p=dewiki"'."\n\n";
        $quit_program = 'yes';
    }

    ####################
    #check argument value for scanmodus
    $found_argv = 'no';
    foreach my $current_argv (@ARGV) {
        if (   $current_argv eq 'm=dump'
            or $current_argv eq 'm=live'
            or $current_argv eq 'm=only' ) {
            $found_argv   = 'yes';
            $dump_or_live = $current_argv;
            $dump_or_live =~ s/^m=//;
        }
    }
    if ($found_argv eq 'no'){
        #no scan modus
        $quit_reason .= 'modus unknown, for example: "m=dump/live/only"'."\n\n";
        $quit_program = 'yes';
    }

    ####################
    #check argument value for silent or test
    foreach my $current_argv (@ARGV) {
        $silent_modus = 'silent' if ( $current_argv eq 'silent' );
        $test_modus   = 'test'   if ( $current_argv eq 'test');
    }

    if ($quit_program eq 'yes'){
        #End of Script, because no correct parameter
        $quit_reason .= 'Use for scan a dump'."\n";
        $quit_reason .= 'perl -w checkwiki.pl p=dewiki m=dump'."\n";
        $quit_reason .= 'perl -w checkwiki.pl p=nds_nlwiki m=dump'."\n\n";
        $quit_reason .= 'perl -w checkwiki.pl p=nds_nlwiki m=dump silent'."\n\n";
        $quit_reason .= 'perl -w checkwiki.pl p=nds_nlwiki m=dump silent test'."\n\n";
        $quit_reason .= 'Use for scan a list of pages live'."\n";
        $quit_reason .= 'perl -w checkwiki.pl p=dewiki m=live'."\n";
        $quit_reason .= 'perl -w checkwiki.pl p=dewiki m=live silent'."\n";
        $quit_reason .= 'perl -w checkwiki.pl p=dewiki m=live silent test'."\n";
        $quit_reason .= "\n";
    } else {
        # All parameters available and correct
        # extract parameters
        print "\n";
        if ($silent_modus ne 'silent') {
            print '##################################################'."\n";
            print '######## checkwiki.pl - Version 0.2 ########'."\n";
        }
        print '##################################################'."\n";
        print 'Start: '."\t\t".$akJahr.'-'.$akMonat.'-'.$akMonatstag.' '.$akStunden.':'.$akMinuten."\n";
        print 'Project:'."\t\t". $project."\n";
        if ($silent_modus ne 'silent') {
            print 'Modus: '."\t\t". $dump_or_live. ' (';
            print 'scan a dump' if ($dump_or_live eq 'dump');
            print 'scan live' if ($dump_or_live eq 'live');
            print 'scan a dump only some errors' if ($dump_or_live eq 'only');
            print ')'."\n";
        }
        # test runs work on a separate project name (and output directory)
        $project = $project.'_test' if ($test_modus eq 'test');
        print "\t\t\t".'Test-Modus --> '.$project.'!!!'."\n" if ($test_modus eq 'test');
    }
}

sub open_file{
    # NOTE: this definition continues past this point in the file; the
    # statements below are unchanged.
    # create subdirectory
    #print $output_directory.$project."\n";
    if (not (-e $output_directory.$project )) {
        print 'create directory:'."\t". $output_directory.$project."\n";
        mkdir($output_directory.$project ,0777);
    }

    ################################
    # if new dump is available
    if ($dump_or_live eq 'dump') {
        $dump_filename = search_for_last_dump();
        print 'Dump_filename:'."\t\t".$dump_filename."\n" if ($silent_modus ne 'silent');
        my $last_dump_filename= $output_directory.$project.'/'.$project.'_last_dump_name.txt';
        #print $last_dump_filename."\n";
        if (not (-e $last_dump_filename)) {
            # create the file if not exist
            print 'create last_dump_file:'."\t".$project.'_last_dump_name.txt'."\n";
            open (LAST_DUMP_NAME_FIRST, '>'.$last_dump_filename);
            print LAST_DUMP_NAME_FIRST 'x';
            close(LAST_DUMP_NAME_FIRST);
        }
        #read the last name
        #print 'check old dumpname'."\n";
        open (LAST_DUMP_NAME, '<'.$last_dump_filename);
        my $last_dump_name_old = '';
        $last_dump_name_old = <LAST_DUMP_NAME>;
        #$last_dump_name_old = '' if not defined;
        $last_dump_name_old =~ s/\n//g;
        close(LAST_DUMP_NAME);
        if ($dump_filename ne $last_dump_name_old ) {
            # if not the newest dump then start dump scan
            print 'Last: 
'."\t\t". $last_dump_name_old."\n"; print 'Current: '."\t\t". $dump_filename."\n"; open (LAST_DUMP_NAME, '>'.$last_dump_filename); print LAST_DUMP_NAME $dump_filename; close(LAST_DUMP_NAME); #print 'nice -n 5 perl -w checkwiki.pl p='.$project.' m=dump' ."\n"; # if ($dump_or_live eq 'live') { # print "\n\n"; # system ('nice -n 5 perl -w checkwiki.pl p='.$project.' m=dump silent') ; # print "\n\n"; # } } } ################################ if ($dump_or_live eq 'dump' or $dump_or_live eq 'only') { #print "lsat=x".$dump_filename."x\n"; #die; # check for existens dump if ($dump_filename ne '' and -e "$dump_directory$dump_filename") { #print 'Data: '."\t\t"."$dump_directory$dump_filename\n"; #open dump open(DUMP, "<$dump_directory/$dump_filename"); read_and_write_metadata_from_dump(); } else { $quit_program = 'yes'; $quit_reason = $quit_reason. "file '$dump_directory$dump_filename'". " don't exist!\n"; } # Templatetiger our $templatetiger_filename = $output_templatetiger.$project.'/'.$project.'_templatetiger.txt'; if (not (-e $output_templatetiger.$project )) { print 'create new subdirectory'."\t".'templatetiger'."\n"; mkdir($output_templatetiger.$project ,0777); } if (-e $templatetiger_filename ) { #print 'Delete '.$templatetiger_filename."\n"; system ('rm -f '.$templatetiger_filename) ; } #GEO Export our $geo_export_filename = $output_geo.$project.'/'.$project.'_coordinates.txt'; if (not (-e $output_geo.$project )) { print 'create new subdirectory'."\t".'geo'."\n"; mkdir($output_geo.$project ,0777); } if (-e $geo_export_filename ) { print 'Delete '.$geo_export_filename."\n"; system ('rm -f '.$geo_export_filename) ; } } if ($dump_or_live eq 'live' ) { # open list for live #print 'Data: '."\t\t".$output_directory.$project.'/'.$project.'_'.$error_list_filename ."\n"; if (not (-e $output_directory.$project.'/'.$project.'_'.$error_list_filename )){ $quit_program = 'yes'; $quit_reason = $quit_reason. "file:" .$output_directory.$project.'/'.$project.'_'.$error_list_filename. 
" don't exist!\n"; } else { #read articles(live) article_last_scan(); # get all article from last scan, where the script found errors new_article(); # get all new article last days last_change_article(); # get all new article last days geo_error_article(); # get all articles with geo errors last days article_with_error_from_dump_scan(); # get all articles error from the last dump scan # sort all articles (new + live) @live_article = sort(@live_article); $number_of_live_tests = @live_article; # delet all double/multi input article my @new_live_article; my @split_line; my @split_line_old; my $old_title = ''; my $all_errors_of_this_article = ''; my $i = -1; $number_of_live_tests = @live_article; foreach (@live_article) { @split_line_old = @split_line; @split_line = split(/\t/, $_); my $current_title = $split_line[0]; $split_line[1] =~ s/\n//; #print $current_title."\n"; my $number_of_split_line = @split_line; if ($number_of_split_line != 2) { print 'Problem with input line:'."\n"; print $_."\n"; die; }; if ($old_title ne $current_title and $old_title ne ''){ #save old $i = $i+1; $new_live_article[$i] = $old_title."\t".$all_errors_of_this_article; $all_errors_of_this_article = ''; #print "result:".$new_live_article[$i]."\n"; } # check new if ($old_title eq $current_title) { #double $all_errors_of_this_article = $all_errors_of_this_article.', '.$split_line[1]; #print 'double: '.$current_title."\t".$all_errors_of_this_article."\n"; } else { $all_errors_of_this_article = $split_line[1]; #print 'normal: '.$current_title."\t".$all_errors_of_this_article."\n"; } $old_title = $current_title; } #save last $i = $i+1; $new_live_article[$i] = $old_title."\t".$all_errors_of_this_article; @live_article = @new_live_article; $number_of_live_tests = @live_article; print 'articles without double'."\t".$number_of_live_tests."\n"; @new_live_article = (); # free memory @split_line = (); # free memory #foreach (@live_article) { # print $_."\n"; #} } } # delete old error_list if 
($quit_program eq 'no' ) {
        read_and_write_metadata_from_dump();
        load_metadata_from_file();
    }
}

sub article_last_scan{
    # Loads the error list written by the last scan into @live_article
    # (one raw line per entry) and prints how many lines were read.
    my $file_input_live = $output_directory.$project.'/'.$project.'_'.$error_list_filename;
    #print $file_input_live."\n";
    @live_article = ();
    if (open(my $live_fh, '<', $file_input_live)) {
        @live_article = <$live_fh>;
        close($live_fh);
    } else {
        # keep the list empty, but say why
        warn 'cannot open '.$file_input_live.': '.$!."\n";
    }
    $number_of_live_tests = @live_article;
    print 'articles last scan:'."\t".$number_of_live_tests."\n";
}

sub new_article{
    # Load new articles
    # Each input line is tab separated; the second column is used as the
    # title and appended to @live_article as "<title>\t0".
    # Lines without a second column are skipped.
    my $file_new = $project.'_new_article.txt';
    my $file_input_new = $input_directory_new.$project.'/'.$file_new;
    #print $file_input_new."\n";
    my $new_counter = 0;
    if (-e $file_input_new) {
        #if existing
        if (open(my $input_fh, '<', $file_input_new)) {
            # while() stops cleanly on an empty file
            while (my $line = <$input_fh>) {
                $line =~ s/\n$//g;
                my @split_line = split ( /\t/, $line);
                next if (@split_line < 2);      # no title column
                push(@live_article, $split_line[1]."\t".'0' );
                #print $split_line[1]."\t".'0'."\n";
                $new_counter ++;
            }
            close($input_fh);
        } else {
            warn 'cannot open '.$file_input_new.': '.$!."\n";
        }
    }
    print 'articles new:'."\t\t".$new_counter;
    print ' (no file: '.$file_new.' )' if not (-e $file_input_new);
    print "\n";
    $for_statistic_new_article = $new_counter;
}

sub last_change_article{
    # Load last change articles
    # Same input format as new_article: the second tab separated column is
    # the title, appended to @live_article as "<title>\t0".
    my $file_last_change = $project.'_last_changes.txt';
    my $file_input_last_change = $input_directory_change.$project.'/'.$file_last_change;
    #print $file_input_new."\n";
    my $change_counter = 0;
    if (-e $file_input_last_change) {
        #if existing
        #print 'file exist'."\n";
        if (open(my $input_fh, '<', $file_input_last_change)) {
            while (my $line = <$input_fh>) {
                $line =~ s/\n$//g;
                my @split_line = split ( /\t/, $line);
                next if (@split_line < 2);      # no title column
                push(@live_article, $split_line[1]."\t".'0' );
                $change_counter ++;
            }
            close($input_fh);
        } else {
            warn 'cannot open '.$file_input_last_change.': '.$!."\n";
        }
    }
    print 'articles change:'."\t".$change_counter;
    print ' (no file: '.$file_last_change.' )' if not (-e $file_input_last_change);
    print "\n";
    our $for_statistic_last_change_article = $change_counter;
}

sub geo_error_article{
    # Load the articles from the geo error list.
    # The first tab separated column of each non-empty line is the title,
    # appended to @live_article as "<title>\t0".
    my $file_geo = $project.'_'.$error_geo_list_filename;
    my $file_input_geo = $output_geo.$project.'/'.$file_geo;
    #print $file_input_new."\n";
    my $geo_counter = 0;
    if (-e $file_input_geo) {
        #if existing
        #print 'file exist'."\n";
        if (open(my $input_fh, '<', $file_input_geo)) {
            while (my $line = <$input_fh>) {
                $line =~ s/\n$//g;
                my @split_line = split ( /\t/, $line);
                next if (@split_line < 1);      # empty line
                push(@live_article, $split_line[0]."\t".'0' );
                $geo_counter ++;
            }
            close($input_fh);
        } else {
            warn 'cannot open '.$file_input_geo.': '.$!."\n";
        }
    }
    print 'articles geo:'."\t\t".$geo_counter;
    print ' (no file: '.$file_geo.' )' if not (-e $file_input_geo);
    print "\n";
    $for_statistic_geo_article = $geo_counter;
}

sub article_with_error_from_dump_scan{
    # In live modus only: appends every "<title>\t<error>" line of the error
    # list from the last dump scan to @live_article and then deletes that
    # file.
    if ( $dump_or_live eq 'live') {
        # if a new dump is available
        my $input_dump_errors = $output_directory.$project.'/'.$project.'_'.$error_list_filename_dump;
        #print $file_input_new."\n";
        my $dump_counter = 0;
        if (-e $input_dump_errors) {
            #if existing
            #print 'file exist'."\n";
            if (open(my $input_fh, '<', $input_dump_errors)) {
                while (my $line = <$input_fh>) {
                    $line =~ s/\n$//g;
                    my @split_line = split ( /\t/, $line);
                    # a line without the error column would later make
                    # open_file die with 'Problem with input line'
                    next if (@split_line < 2);
                    push(@live_article, $split_line[0]."\t".$split_line[1] );
                    $dump_counter ++;
                }
                close($input_fh);
                # delete (only after the content was read; no shell needed)
                unlink($input_dump_errors)
                    or warn 'cannot delete '.$input_dump_errors.': '.$!."\n";
            } else {
                warn 'cannot open '.$input_dump_errors.': '.$!."\n";
            }
        }
        print 'articles dump:'."\t\t".$dump_counter."\n";
    }
}

sub search_for_last_dump {
    # search in dump_directory for the last XML-file of a project
    # Returns the file name (without directory) of the last non-empty
    # "<project>-*.xml" file in glob() order, or '' when there is none.
    # In test modus ("<project>_test") the dumps of the real project match
    # as well. When nothing is found and the modus is not 'live', the
    # program is flagged to quit.
    my $last_file ='';
    my @xml_files = glob($dump_directory.'*.xml');
    my $count_xml_files = @xml_files;
    my $project_test = $project;
    $project_test =~ s/_test$//;
    for (my $i = 0; $i < $count_xml_files; $i++) {
        # List of all xml-files in dump_directory
        my $byte = -s $xml_files[$i];
        $byte = 0 if (not defined $byte);       # -s gives undef for a missing file
        #print $xml_files[$i].' '.$byte."\n";
        $xml_files[$i] =~ s/(.)+\///g;          # strip the directory part
        # NOTE(review): the size test is taken to be active code (the
        # collapsed source is ambiguous here) - confirm against the original.
        if ((     index($xml_files[$i], $project.'-') == 0          # only this project
               or index($xml_files[$i], $project_test.'-') == 0 )
            and $byte > 0 ) {                                       # only more then 0 bytes files
            #the last project dump (more then 0 byte)
            #print "\t".$xml_files[$i]."\n";
            $last_file = $xml_files[$i];
        }
    }
    if ($last_file eq '' and $dump_or_live ne 'live') {
        # stop if dump scan , run if the program will scan live
        # No file found
        $quit_program = 'yes';
        $quit_reason = $quit_reason.$count_xml_files.' XML-files found in folder '.$dump_directory."\n";
        $quit_reason = $quit_reason.'Found no XML-file for project: '.$project."\n";
    }
    @xml_files = ();    # free memory
    return($last_file);
}

############################################################################

sub scan_pages{
    # get the text of the next page
    # Main loop: resets the per-article variables, fetches the next page
    # (from the dump or from the live list) and runs check_article() on it.
    # Pages whose title ends in ".js" or ".css" are not checked. The loop
    # ends at the end of the dump/live list, or in live modus once more than
    # 40000 errors were found.
    print 'Start scanning'."\n" if ($silent_modus ne 'silent');
    our $end_of_dump = 'no';    # when last article from dump scan then 'yes', else 'no'
    our $end_of_live = 'no';    # when last article from live scan then 'yes', else 'no'
    do {
        set_variables_for_article();
        if ($dump_or_live eq 'dump' or $dump_or_live eq 'only') {
            get_next_page_from_dump();
        } else {
            get_next_page_from_live();
        }
        if (    $end_of_dump eq 'no'
            and $end_of_live eq 'no'
            and not ( $title =~ /\.js$/ or $title =~ /\.css$/ ) ) {
            check_article();    #Main check routine
        } else {
            if ( $end_of_dump eq 'yes' or $end_of_live eq 'yes' ) {
                print 'articles scan finish'."\n\n" if ($silent_modus ne 'silent');
            } else {
                print 'no check in article:'."\t\t".$title."\n";
            }
        }
    }
    until (    $end_of_dump eq 'yes'
            or $end_of_live eq 'yes'
            #or $page_number > 20
            #or $page_id > 7950
            #or ($error_counter > 10000 and $project ne 'dewiki')
            or ($error_counter > 40000 and $dump_or_live eq 'live')
          );
}

sub set_variables_for_article {
    # Counts the page and resets the per-article package globals.
    # NOTE(review): the arrays are only declared here, not emptied (only
    # @section is undef'ed) - confirm that the code filling them resets them
    # for every article.
    $page_number = $page_number + 1;
    our $title = '';                    # title of the current article
    our $page_id = -1;                  # page id of the current article
    our $revision_id = -1;              # revision id of the current article
    our $revision_time = -1;            # revision time of the current article
    our $text = '';                     # text of the current article
    our $page_namespace = -100;         # namespace of page
    our $page_is_redirect = 'no';
    our $page_is_disambiguation = 'no';
    our $page_categories = '';
    our $page_interwikis = '';
    our $page_has_error = 'no';         # yes/no error in this page
    our $page_error_number = -1;        # number of all article for this page
    our @comments;                      # 0 pos_start
                                        # 1 pos_end
                                        # 2 comment
    our $comment_counter = -1;          #number of comments in this page
    our @category;                      # 0 pos_start
                                        # 1 pos_end
                                        # 2 category    Test
                                        # 3 linkname    Linkname
                                        # 4 original    [[Category:Test|Linkname]]
    our $category_counter = -1;
    our $category_all = '';             # all categries
    our @interwiki;                     # 0 pos_start
                                        # 1 pos_end
                                        # 2 interwiki   Test
                                        # 3 linkname    Linkname
                                        # 4 original    [[de:Test|Linkname]]
                                        # 5 language
    our $interwiki_counter = -1;
    our @lines;                         # text seperated in lines
    our @headlines;                     # headlines
    our @section;                       # text between headlines
    undef(@section);
    our @lines_first_blank;             # all lines where the first character is ' '
    our @templates_all;                 # all templates
    our @templates;                     # templates with values
                                        # 0 number of template
                                        # 1 templatename
                                        # 2 template_row
                                        # 3 attribut
                                        # 4 value
    our $number_of_template_parts = -1; # number of all template parts
    our @links_all;                     # all links
    our @images_all;                    # all images
    our @isbn;                          # all ibsn of books
    our @ref;                           # all ref
    our $page_has_geo_error = 'no';     # yes/no geo error in this page
    our $page_geo_error_number = -1;    # number of all article for this page
    our $details_for_page = 'no';       # yes/no durring the scan you can get more details for a article scan
}

sub close_file {
    #close all open files
    close (DUMP);
}

sub read_and_write_metadata_from_dump {
    # NOTE: this definition continues past this point in the file; the
    # statements below are unchanged.
    # read the metadata from dump (<xml … <siteinfo>…</siteinfo>)
    # write this metadata in file for dump and live-scan
    #print 'Read metadata from dump and write in file'."\n";
    #old from dump
    # my $line ='';
    # my $end = 'no';
    my $metadata = '';
    #
do { # $line_number = $line_number + 1; # $line = <DUMP>; # #print $line_number.' '.$line; # $line =~ s/\n//; # $metadata = $metadata.$line."\n"; # if (index ($line, '</siteinfo>') > -1) { # $end = 'yes'; # } # # } # until ( $end eq 'yes'); #new from web # raw_text2 #print 'get Metadaten from :'.$project.' '.$language."\n"; $language = 'nds-nl' if ($project eq 'nds_nlwiki'); my $url = 'http://'.$language.'.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases|statistics|magicwords&format=xml'; if ($project eq 'commonswiki') { $url = 'http://commons.wikimedia.org/w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases|statistics|magicwords&format=xml'; } $metadata = raw_text2($url); $language = 'nds_nl' if ($project eq 'nds_nlwiki'); my $file_metadata = $output_directory.$project.'/'.$project.'_metadata.txt'; #print $file_metadata."\n"; open(METADATA, ">$file_metadata"); print METADATA $metadata; close(METADATA); $metadata = ''; } sub load_metadata_from_file { # load metadata from file for dump and live # this file is from the last dump (if live) or current dump (if dump) #print 'Read metadata from file'."\n"; my $file_metadata = $output_directory.$project.'/'.$project.'_metadata.txt'; open(METADATA, "<$file_metadata"); my @metadata = <METADATA>; close(METADATA); my $metatext = ''; foreach (@metadata) { $metatext = $metatext.$_; } #print $metatext."\n"; #Extract metadata #sitename my $sitename = ''; my $pos1 = index($metatext,'sitename="') + length('sitename="'); my $pos2 = index($metatext,'"', $pos1); $sitename = substr($metatext, $pos1, $pos2 - $pos1); print 'Sitename: '."\t\t".$sitename."\n" if ($silent_modus ne 'silent'); #base $base = ''; $pos1 = index($metatext,'base="') + length('base="'); $pos2 = index($metatext,'"', $pos1 ); $base = substr($metatext, $pos1, $pos2 -$pos1); print 'Base: '."\t\t".$base."\n" if ($silent_modus ne 'silent'); $home = $base; $home =~ s/[^\/]+$//; #print 'Home: 
'."\t\t".$home."\n"; #namespace my $namespaces = ''; $pos1 = index($metatext,'<namespaces>') + length('<namespaces>'); $pos2 = index($metatext,'</namespaces>', $pos1); $namespaces = substr($metatext, $pos1, $pos2 -$pos1); #print "x".$namespaces."x\n"; #$namespaces =~ s/^\n//g; $namespaces =~ s/<\/ns>/\n/g; $namespaces =~ s/" subpages="" //g; $namespaces =~ s/<ns id="//g; $namespaces =~ s/" canonical="/\t/g; $namespaces =~ s/canonical="/\t/g; $namespaces =~ s/">/\t/g; $namespaces =~ s/" \/>/\t\n/g; #$namespaces =~ s/ //g; #print "x".$namespaces."x\n"; my @namespaces_split = split( /\n/, $namespaces); $namespaces_count = @namespaces_split; #print $namespaces_count; for (my $i = 0; $i < $namespaces_count; $i++) { #print $i."\t".$namespaces_split[$i]."\n"; my @splitter = split( /\t/, $namespaces_split[$i]); $namespace[$i][0] = int($splitter[0]); $namespace[$i][1] = $splitter[2]; $namespace[$i][1] = '' if ($namespace[$i][0] == 0); $namespace[$i][2] = $splitter[1]; $namespace[$i][2] = '' if ($namespace[$i][0] == 0); if ($namespace[$i][0] == 6) { # image $namespace_image[0] = $namespace[$i][1]; $namespace_image[1] = $namespace[$i][2]; } if ($namespace[$i][0] == 10) { # templates $namespace_templates[0] = $namespace[$i][1]; $namespace_templates[1] = $namespace[$i][2] if ($namespace[$i][1] ne $namespace[$i][2]); } if ($namespace[$i][0] == 14) { #category $namespace_cat[0] = $namespace[$i][1]; $namespace_cat[1] = $namespace[$i][2] if ($namespace[$i][1] ne $namespace[$i][2]); } } # namespacealiases my $namespacealiases_text = ''; $pos1 = index($metatext,'<namespacealiases>') + length('<namespacealiases>'); $pos2 = index($metatext,'</namespacealiases>', $pos1); $namespacealiases_text = substr($metatext, $pos1, $pos2 -$pos1); #print $namespacealiases_text. "\n"; $namespacealiases_text =~ s/<\/ns>/\n/g; $namespacealiases_text =~ s/<ns id="//g; $namespacealiases_text =~ s/">/\t/g; #print $namespacealiases_text. 
"\n"; my @namespacealiases_split = split( /\n/, $namespacealiases_text); $namespacealiases_count = @namespacealiases_split; #print $namespaces_count; for (my $i = 0; $i < $namespacealiases_count; $i++) { my @splitter = split( /\t/, $namespacealiases_split[$i]); if ($splitter[0] eq '6') { #aliasname for image push(@namespace_image, $splitter[1]); } if ($splitter[0] eq '10') { #aliasname for templates push(@namespace_templates, $splitter[1]); } if ($splitter[0] eq '14') { #aliasname for category push(@namespace_cat, $splitter[1]); } #save all aliases $namespacealiases[$i][0] = $splitter[0]; $namespacealiases[$i][1] = $splitter[1]; #print 'Namespacealiases: '.$namespacealiases[$i][0].','.$namespacealiases[$i][1]."\n"; } #foreach (@namespace_image) { # print $_."\n"; #} #print "\n"; #foreach (@namespace_cat) { # print $_."\n"; #} #magicwords @magicword_defaultsort = get_magicword($metatext, 'defaultsort'); @magicword_img_thumbnail = get_magicword($metatext, 'img_thumbnail'); @magicword_img_manualthumb = get_magicword($metatext, 'img_manualthumb'); @magicword_img_right = get_magicword($metatext, 'img_right'); @magicword_img_left = get_magicword($metatext, 'img_left'); @magicword_img_none = get_magicword($metatext, 'img_none'); @magicword_img_center = get_magicword($metatext, 'img_center'); @magicword_img_framed = get_magicword($metatext, 'img_framed'); @magicword_img_frameless = get_magicword($metatext, 'img_frameless'); @magicword_img_page = get_magicword($metatext, 'img_page'); @magicword_img_upright = get_magicword($metatext, 'img_upright'); @magicword_img_border = get_magicword($metatext, 'img_border'); @magicword_img_sub = get_magicword($metatext, 'img_sub'); @magicword_img_super = get_magicword($metatext, 'img_super'); @magicword_img_link = get_magicword($metatext, 'img_link'); @magicword_img_alt = get_magicword($metatext, 'img_alt'); @magicword_img_width = get_magicword($metatext, 'img_width'); @magicword_img_baseline = get_magicword($metatext, 'img_baseline'); 
@magicword_img_top = get_magicword($metatext, 'img_top');
@magicword_img_text_top = get_magicword($metatext, 'img_text_top');
@magicword_img_middle = get_magicword($metatext, 'img_middle');
@magicword_img_bottom = get_magicword($metatext, 'img_bottom');
@magicword_img_text_bottom = get_magicword($metatext, 'img_text_bottom');

#foreach (@magicword_defaultsort) {
#    print $_."\n";
#}
#die;
}


sub get_magicword {
    # Collect all aliases of one magic word from the siteinfo XML.
    #   $_[0]  XML text containing <magicword name="..."> elements
    #   $_[1]  name of the magic word, for example 'defaultsort'
    # Returns the list of <alias> values. The list is empty when the
    # magic word is unknown or its element is not closed.
    my ($metatext, $key) = @_;
    my @result;

    # The closing quote is part of the search string, so that 'defaultsort'
    # does not match a longer name such as 'defaultsort_noerror'.
    my $pos1 = index($metatext, '<magicword name="'.$key.'"');
    return @result if ($pos1 == -1);

    my $pos2 = index($metatext, '</magicword>', $pos1);
    return @result if ($pos2 == -1);

    my $part = substr($metatext, $pos1, $pos2 - $pos1);
    my @part_split = split(/<alias>/, $part);
    shift(@part_split);                 # text in front of the first alias
    foreach my $piece (@part_split) {
        my $pos3 = index($piece, '</alias>');
        next if ($pos3 == -1);          # broken alias, ignore it
        push(@result, substr($piece, 0, $pos3));
    }
    return @result;
}


sub _dump_line_value {
    # Helper for get_next_page_from_dump:
    # returns the text between the first '>' of a line and the next '<' or '>',
    # for example '42' for '    <id>42</id>'.
    my ($line) = @_;
    my ($value) = $line =~ />([^<>]*)/;
    return $value;
}


sub get_next_page_from_dump{
    # Scan the dump line by line up to the next </page> (or the end of the
    # dump). The result is stored in global variables:
    #   $text           wikitext of the article (entities decoded)
    #   $title          content of <title>
    #   $page_id        first <id>, only set while it still holds -1
    #   $revision_id    first <id> after <revision>, only set while it holds -1
    #   $revision_time  content of <timestamp> after <revision>
    #   $line_number    is increased for every line read
    #   $end_of_dump    becomes 'yes' at </mediawiki> or at end of file
    # NOTE(review): $text, $page_id and $revision_id are presumably reset by
    # the caller before every call - not visible in this part of the file.
    my $article_complete = 0;   # 1 after </page>
    my $start_recording  = 0;   # 1 after <page>
    my $revision_start   = 0;   # 1 after <revision>

    #loop for every line
    while (1) {
        my $line = <DUMP>;
        if (not defined $line) {
            # nothing left to read, the dump has no proper end
            $end_of_dump = 'yes';
            last;
        }
        $line_number = $line_number + 1;

        $start_recording = 1 if ($line =~ /<page>/);
        $text = $text.$line  if ($start_recording == 1);

        if ($line =~ /<\/page>/) {
            $start_recording  = 0;
            $article_complete = 1;
        }

        if ($line =~ /<title>/) {
            #extract title
            $title = _dump_line_value($line);
        }

        if ($line =~ /<id>/ and $page_id == -1) {
            #extract id
            $page_id = _dump_line_value($line);
        }

        $revision_start = 1 if ($line =~ /<revision>/);

        if ($revision_start == 1 and $revision_id == -1 and $line =~ /<id>/) {
            #read revision_id
            $revision_id = _dump_line_value($line);
        }

        if ($revision_start == 1 and $line =~ /<timestamp>/) {
            #read revision_time
            $revision_time = _dump_line_value($line);
        }

        $end_of_dump = 'yes' if ($line =~ /<\/mediawiki>/);
        $end_of_dump = 'yes' if (eof(DUMP) == 1);

        last if ($article_complete == 1 or $end_of_dump eq 'yes');
    }

    #Extract only edit-text
    my $text_start = index($text, '<text xml:space="preserve">');
    if ($text_start == -1) {
        # no text element (for example an empty page): index() gives -1 and
        # substr() would keep only the last character
        $text = '';
    }
    else {
        $text = substr($text, $text_start);
        $text =~ s/<text xml:space="preserve">//g;
        my $text_end = index($text, '</text>');
        # without </text> keep everything instead of cutting the last character
        $text = substr($text, 0, $text_end) if ($text_end > -1);
    }
    $text = replace_special_letters($text);
}


sub get_next_page_from_live {
    # Select the next article of the live scan and load its text.
    # Works only with global variables:
    #   $current_live_article        index in @live_to_scan (increased first)
    #   $current_live_error_scan     error number which is scanned at the moment
    #   $maximum_current_error_scan  the index at which the current error is
    #                                reviewed (extend the limit or stop)
    #   @live_to_scan                lines "title<TAB>errors" for the current error
    #   $end_of_live                 becomes 'yes' when nothing is left
    # On success $title and $text are set and the title is pushed to
    # @article_was_scanned.
    # NOTE(review): $error_description[X][3] looks like the number of articles
    # already found for error X in this run - confirm where it is filled.
    $current_live_article ++;   #next article

    if ( $current_live_error_scan != 0 ) {
        # Error not 0
        if ($current_live_error_scan != 0 and $current_live_article == $maximum_current_error_scan) {
            # set number higher if not all 100 errors found
            if ( $error_description[$current_live_error_scan][3] < $max_error_count ) {
                # set higher maximum
                $maximum_current_error_scan = $maximum_current_error_scan
                    + ($max_error_count - $error_description[$current_live_error_scan][3]);
            }
            else {
                # stop scan, keep the rest of the list for the next run
                save_errors_for_next_scan($current_live_article);
                $current_live_article = -1;
            }
        }

        # find next error with articles
        if (   ($current_live_error_scan > 0 and $current_live_article == -1)
            or $current_live_article == $number_article_live_to_scan
            or $current_live_error_scan == -1) {

            $current_live_error_scan = 0 if ($current_live_error_scan == -1);   #start with error 1
            do {
                $current_live_error_scan ++;
                @live_to_scan = ();
                if ($error_description[$current_live_error_scan][3] < $max_error_count) {
                    # only if not all found with new/change/last
                    get_all_error_with_number($current_live_error_scan);
                }
                else {
                    # with new/change etc. enough articles were found for this
                    # error: save the complete list for the next scan
                    get_all_error_with_number($current_live_error_scan);
                    save_errors_for_next_scan(0);
                    @live_to_scan = ();
                }
                $number_article_live_to_scan = @live_to_scan;
            }
            until ($current_live_error_scan >= $number_of_max_errors or $number_article_live_to_scan > 0);

            $maximum_current_error_scan = $max_error_count;
            if ($error_description[$current_live_error_scan][3] > 0) {
                # some articles already found, search only for the rest
                $maximum_current_error_scan = $max_error_count - $error_description[$current_live_error_scan][3];
            }
            $current_live_article = 0;
        }
    }

    if ( $current_live_error_scan == 0 and $current_live_article >= $number_article_live_to_scan ) {
        $end_of_live = 'yes';   # end of live
    }

    if ($current_live_error_scan >= $number_of_max_errors) {
        # all error numbers done, go on with the list of error 0
        $current_live_article = 0;
        $current_live_error_scan = 0;
        get_all_error_with_number($current_live_error_scan);
        $number_article_live_to_scan = @live_to_scan;
        $maximum_current_error_scan = $max_error_count;
    }

    if (    $current_live_article < $number_article_live_to_scan
        and $number_article_live_to_scan > 0
        and $end_of_live ne 'yes' ) {

        my $line = $live_to_scan[$current_live_article];
        my @line_split = split( /\t/, $line);

        # get text
        $title = $line_split[0];
        $text = raw_text($title);
        push(@article_was_scanned, $title);

        # cut the wikitext out of <rev timestamp="..." ...> ... </rev>
        my $rev_start = index($text, '<rev timestamp="');
        if ($rev_start == -1) {
            # no revision in the answer (page missing or request failed);
            # index() = -1 would cut the answer at a random position
            $text = '';
        }
        else {
            my $pos = index($text, '">', $rev_start);
            $text = substr($text, $pos + 2);
            my $rev_end = index($text, '</rev>');
            $text = substr($text, 0, $rev_end) if ($rev_end > -1);
        }
        $text = replace_special_letters($text);
    }
}


sub save_errors_for_next_scan {
    # Append the titles of @live_to_scan, starting at index $_[0], together
    # with the current error number to $rest_of_errors_not_scan_yet
    # (one "\n".title."\t".error entry per article).
    # Also refreshes $number_article_live_to_scan.
    my $from_number = $_[0];
    $number_article_live_to_scan = @live_to_scan;
    for (my $index = $from_number; $index < $number_article_live_to_scan; $index++) {
        my ($rest_title) = split( /\t/, $live_to_scan[$index]);
        $rest_of_errors_not_scan_yet = $rest_of_errors_not_scan_yet
            ."\n".$rest_title."\t".$current_live_error_scan;
    }
}


sub get_all_error_with_number {
    # Push all lines of @live_article which have the error number $_[0]
    # and were not scanned in this run to @live_to_scan.
    # A line has the form "title<TAB>error, error, ...".
    my $error_live = $_[0];

    # titles which were scanned in this run
    my %was_scanned = map { $_ => 1 } @article_was_scanned;

    foreach my $current_live_line (@live_article) {
        my @line_split = split( /\t/, $current_live_line);
        next if (not defined $line_split[1]);       # line without error list

        # has the article the error X ?
        my $found = 'no';
        foreach my $error_number (split( ', ', $line_split[1])) {
            $found = 'yes' if ($error_live eq $error_number);
        }
        next if ($found eq 'no');

        # was this article scanned today ?
        next if (exists $was_scanned{$line_split[0]});

        push(@live_to_scan, $current_live_line);
    }
}


sub get_all_error_with_type {
    # at the moment not in use
    # Push all lines of @live_article whose second field is equal to $_[0]
    # to @live_to_scan.
    my $error_type = $_[0];
    foreach my $current_live_line (@live_article) {
        my @line_split = split( /\t/, $current_live_line);
        next if (not defined $line_split[1]);
        push(@live_to_scan, $current_live_line) if ($line_split[1] eq $error_type);
    }
}


sub replace_special_letters {
    # Decode the XML entities of a text and return the result.
    # only in dump must replace not in live
    # http://de.wikipedia.org/w/index.php?title=Benutzer_Diskussion:Stefan_K%C3%BChn&oldid=48573921#Dump
    my $content = $_[0];
    $content =~ s/&lt;/</g;
    $content =~ s/&gt;/>/g;
    $content =~ s/&quot;/"/g;
    $content =~ s/&#039;/'/g;
    # &amp; must be the last one, otherwise "&amp;lt;" is decoded twice
    $content =~ s/&amp;/&/g;
    return ($content);
}


sub raw_text {
    # Fetch the last revision (timestamp and content) of the page $_[0]
    # with the API of the project ($home) and return the XML answer.
    # Returns '' when the request fails or the answer is empty.
    my $title = $_[0];

    # the title can come from XML: decode the entities first
    $title =~ s/&lt;/</g;
    $title =~ s/&gt;/>/g;
    $title =~ s/&quot;/"/g;
    $title =~ s/&#0?39;/'/g;        # Problem with apostroph in title
    $title =~ s/&amp;/&/g;          # Problem with & in title

    # escape the title for the query string (& becomes %26 and so on);
    # uri_escape() works only with bytes, character strings need the _utf8 variant
    if (utf8::is_utf8($title)) {
        $title = uri_escape_utf8($title);
    }
    else {
        $title = uri_escape($title);
    }

    my $url2 = $home;
    $url2 =~ s/\/wiki\//\/w\//;
    # old: $url2 = $url2.'index.php?title='.$title.'&action=raw';
    $url2 = $url2.'api.php?action=query&prop=revisions&titles='.$title.'&rvprop=timestamp|content&format=xml';

    my $ua2 = LWP::UserAgent->new;
    my $response2 = $ua2->get( $url2 );
    return('') if (not $response2->is_success);     # no error page as article text

    my $content2 = $response2->content;
    my $result2 = '';
    $result2 = $content2 if ($content2);
    return($result2);
}


sub raw_text2 {
    # Fetch the URL $_[0] and return the content of the answer.
    # Returns '' when the request fails or the answer is empty.
    # The URL is used as given, only the entities &amp; and &#39; are replaced.
    my $url = $_[0];
    $url =~ s/&amp;/%26/g;          # Problem with & in title
    $url =~ s/&#0?39;/'/g;          # Problem with apostroph in title

    my $ua2 = LWP::UserAgent->new;
    my $response2 = $ua2->get( $url );
    return('') if (not $response2->is_success);

    my $content2 = $response2->content;
    my $result2 = '';
    $result2 = $content2 if ($content2);
    return($result2);
}


sub text_translation_input{
    # Load the translation page of the project from the wiki and fill
    #   $start_text, $description_text, $category_text,
    #   $top_priority_project, $middle_priority_project, $lowest_priority_project
    # (only when the page has a non-empty value) and for every error number
    #   $error_description[X][4]  priority of the project (-1 = unknown)
    #   $error_description[X][5]  headline of the project
    #   $error_description[X][6]  description of the project
    print 'Load tanslation of:'."\t".$project."\n" if ($silent_modus ne 'silent');

    # Input of translation page
    my %translation_page_of = (
        'afwiki'      => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'arwiki'      => 'ويكيبيديا:فحص_ويكيبيديا/ترجمة',
        'cawiki'      => 'Viquipèdia:WikiProject Check Wikipedia/Translation',
        'cswiki'      => 'Wikipedie:WikiProjekt Check Wikipedia/Translation',
        'commonswiki' => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'cywiki'      => 'Wicipedia:WikiProject Check Wikipedia/Translation',
        'dawiki'      => 'Wikipedia:WikiProjekt Check Wikipedia/Oversættelse',
        'dewiki'      => 'Wikipedia:WikiProject Check Wikipedia/Übersetzung',
        'enwiki'      => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'eowiki'      => 'Vikipedio:WikiProjekt Check Wikipedia/Translation',
        'eswiki'      => 'Wikiproyecto:Check Wikipedia/Translation',
        'fiwiki'      => 'Wikipedia:Wikiprojekti Check Wikipedia/Translation',
        'frwiki'      => 'Projet:Correction syntaxique/Traduction',
        'fywiki'      => 'Wikipedy:WikiProject Check Wikipedia/Translation',
        'hewiki'      => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'huwiki'      => 'Wikipédia:Ellenőrzőműhely/Fordítás',
        'idwiki'      => 'Wikipedia:WikiProjekt Check Wikipedia/Translation',
        'iswiki'      => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'itwiki'      => 'Wikipedia:WikiProjekt Check Wikipedia/Translation',
        'jawiki'      => 'Wikipedia:ウィキプロジェクト ウィキ文法のチェック/Translation',
        'lawiki'      => 'Vicipaedia:WikiProject Check Wikipedia/Translation',
        'ndswiki'     => 'Wikipedia:Wikiproject Check Wikipedia/Translation',
        'nds_nlwiki'  => 'Wikipedie:WikiProject Check Wikipedia/Translation',
        'nlwiki'      => 'Wikipedia:Wikiproject/Check Wikipedia/Vertaling',
        'nowiki'      => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'pdcwiki'     => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'plwiki'      => 'Wikiprojekt:Check Wikipedia/Tłumaczenie',
        'ptwiki'      => 'Wikipedia:Projetos/Check Wikipedia/Tradução',
        'ruwiki'      => 'Википедия:Страницы с ошибками в викитексте/Перевод',
        'rowiki'      => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'skwiki'      => 'Wikipédia:WikiProjekt Check Wikipedia/Translation',
        'svwiki'      => 'Wikipedia:Projekt wikifiering/Syntaxfel/Translation',
        'trwiki'      => 'Vikipedi:Vikipedi proje kontrolü/Çeviri',
        'ukwiki'      => 'Вікіпедія:Проект:Check Wikipedia/Translation',
        'yiwiki'      => 'װיקיפּעדיע:קאנטראלירן_בלעטער/Translation',
        'zhwiki'      => '维基百科:专题/错误检查/翻译',
    );
    # unknown project: $translation_page keeps its old value
    $translation_page = $translation_page_of{$project} if (exists $translation_page_of{$project});

    my $translation_input = raw_text($translation_page);
    $translation_input = replace_special_letters($translation_input);

    # texts of the project: key on the translation page and global variable
    my @project_texts = (
        [ 'start_text_'.$project.'=',       \$start_text ],
        [ 'description_text_'.$project.'=', \$description_text ],
        [ 'category_001=',                  \$category_text ],
        [ 'top_priority_'.$project.'=',     \$top_priority_project ],
        [ 'middle_priority_'.$project.'=',  \$middle_priority_project ],
        [ 'lowest_priority_'.$project.'=',  \$lowest_priority_project ],
    );
    foreach my $project_text (@project_texts) {
        my ($key, $target) = @{$project_text};
        my $input_text = get_translation_text($translation_input, $key, 'END');
        # an empty value never replaces the default
        ${$target} = $input_text if ($input_text ne '');
    }

    # find error description
    for (my $i = 0; $i < $number_of_max_errors; $i++) {
        my $current_error_number = sprintf('error_%03d', $i);   # error_001, error_042, ...

        # Priority
        my $prio = get_translation_text($translation_input, $current_error_number.'_prio_'.$project.'=', 'END');
        if ($prio ne '') {
            $error_description[$i][4] = int($prio);
        }
        else {
            $error_description[$i][4] = -1;
        }

        $error_description[$i][5] = get_translation_text($translation_input, $current_error_number.'_head_'.$project.'=', 'END');
        $error_description[$i][6] = get_translation_text($translation_input, $current_error_number.'_desc_'.$project.'=', 'END');
    }
}


sub get_translation_text {
    # Return the value of one "key=value END" entry of the translation text.
    #   $_[0]  complete translation text
    #   $_[1]  start tag, the key with '=' at the end (for example 'start_text_dewiki=')
    #   $_[2]  end tag (normally 'END')
    # One leading and one trailing space are removed from the value.
    # Returns '' when the start tag or the end tag is not found.
    my ($translation_text, $start_tag, $end_tag) = @_;

    my $pos_1 = index($translation_text, $start_tag);
    return('') if ($pos_1 == -1);

    # search the end tag behind the start tag, not inside of it
    my $pos_2 = index($translation_text, $end_tag, $pos_1 + length($start_tag));
    return('') if ($pos_2 == -1);

    my $result = substr($translation_text, $pos_1, $pos_2 - $pos_1);
    $result = substr($result, index($result, '=') + 1);
    $result =~ s/^ //g;
    $result =~ s/ $//g;
    return ($result);
}


sub text_translation_output{
    # Output of translation-file:
    # writes all texts (script defaults and project translations) as
    # "key=value END" lines to <output_directory><project>/<project>_<translation_file>.
    my $filename = $output_directory.$project.'/'.$project.'_'.$translation_file;
    print 'Output translation:'."\t".$project.'_'.$translation_file."\n" if ($silent_modus ne 'silent');

    my $translation_fh;
    if (not open($translation_fh, '>', $filename)) {
        # without this check all output was lost silently
        warn 'Can not write '.$filename.': '.$!."\n";
        return;
    }

    my $rule = '#########################'."\n";

    #######################################
    print {$translation_fh} '<pre>'."\n";
    print {$translation_fh} ' new translation text under http://toolserver.org/~sk/checkwiki/'.$project.'/'. " (updated daily) \n";

    print {$translation_fh} $rule, '# metadata'."\n", $rule;
    print {$translation_fh} ' project='.$project." END\n";
    print {$translation_fh} ' category_001='.$category_text." END #for example: [[Category:Wikipedia]] \n";
    print {$translation_fh} "\n";

    print {$translation_fh} $rule, '# start text'."\n", $rule;
    print {$translation_fh} "\n";
    print {$translation_fh} ' start_text_'.$project.'='.$start_text." END\n";

    print {$translation_fh} $rule, '# description'."\n", $rule;
    print {$translation_fh} "\n";
    print {$translation_fh} ' description_text_'.$project.'='.$description_text." END\n";

    print {$translation_fh} $rule, '# priority'."\n", $rule;
    print {$translation_fh} "\n";
    print {$translation_fh} ' top_priority_script='.$top_priority_script." END\n";
    print {$translation_fh} ' top_priority_'.$project.'='.$top_priority_project." END\n";
    print {$translation_fh} ' middle_priority_script='.$middle_priority_script." END\n";
    print {$translation_fh} ' middle_priority_'.$project.'='.$middle_priority_project." END\n";
    print {$translation_fh} ' lowest_priority_script='.$lowest_priority_script." END\n";
    print {$translation_fh} ' lowest_priority_'.$project.'='.$lowest_priority_project." END\n";
    print {$translation_fh} "\n";
    print {$translation_fh} " Please only translate the variables with …_".$project." at the end of the name. Not …_script= .\n";

    ########################################
    # count the errors with an english headline (the list starts with error 1)
    my $number_of_error_description = 1;
    while (    defined $error_description[$number_of_error_description][1]
           and $error_description[$number_of_error_description][1] ne '') {
        $number_of_error_description = $number_of_error_description + 1;
    }
    print 'error description:'."\t".$number_of_error_description." (-1) \n" if ($silent_modus ne 'silent');

    print {$translation_fh} $rule, '# error description'."\n", $rule;
    print {$translation_fh} '# prio = -1 (unknown)'."\n";
    print {$translation_fh} '# prio = 0 (deactivated) '."\n";
    print {$translation_fh} '# prio = 1 (top priority)'."\n";
    print {$translation_fh} '# prio = 2 (middle priority)'."\n";
    print {$translation_fh} '# prio = 3 (lowest priority)'."\n";
    print {$translation_fh} "\n";

    for (my $i = 1; $i < $number_of_error_description; $i++) {
        # always three digits, same key as in text_translation_input;
        # before, numbers from 100 lost the number completely ('error_')
        my $current_error_number = sprintf('error_%03d', $i);

        print {$translation_fh} ' '.$current_error_number.'_prio_script='.$error_description[$i][0]." END\n";
        print {$translation_fh} ' '.$current_error_number.'_head_script='.$error_description[$i][1]." END\n";
        print {$translation_fh} ' '.$current_error_number.'_desc_script='.$error_description[$i][2]." END\n";
        print {$translation_fh} ' '.$current_error_number.'_prio_'.$project.'='.$error_description[$i][4]." END\n";
        print {$translation_fh} ' '.$current_error_number.'_head_'.$project.'='.$error_description[$i][5]." END\n";
        print {$translation_fh} ' '.$current_error_number.'_desc_'.$project.'='.$error_description[$i][6]." END\n";
        print {$translation_fh} "\n";
        print {$translation_fh} '###########################################################################'."\n";
        print {$translation_fh} "\n";
    }

    print {$translation_fh} '</pre>'."\n";
    close($translation_fh);
}


sub output_errors{
    #output all errors
    ##########################################
    # output for next live
    print 'errors found:'."\t\t".$error_counter." 
(+1)\n"; ############### # more errors in one article my $output_list_live =''; my $number_of_articles_with_error = -1; for (my $i = 0; $i <= $error_counter; $i++) { $output_list_live = $output_list_live.$page_with_error[$i][1]."\t".$page_with_error[$i][0]."\n"; #if ($i >30) { # print $output_list_live ."\n"; # die; #} } $output_list_live =~ s/\n$//; #last break # all found errors + all errors from last scan, wich not scanned now $output_list_live = $output_list_live.$rest_of_errors_not_scan_yet if ($rest_of_errors_not_scan_yet ne ''); my @output_list; @output_list = split ( /\n/, $output_list_live); my @output_list_sort = sort(@output_list); @output_list = @output_list_sort; @output_list_sort = (); # delet all double/multi input article my @new_output_list; my @split_line; my @split_line_old; my $old_title = ''; my $all_errors_of_this_article = ''; my $i = -1; my $number_of_output_list = @output_list; #print "Line number: ".$number_of_output_list." (before sort)\n"; if ($number_of_output_list > 0) { foreach (@output_list) { @split_line_old = @split_line; @split_line = split(/\t/, $_); my $current_title = $split_line[0]; $split_line[1] =~ s/\n//; #print $current_title."\n"; my $number_of_split_line = @split_line; if ($old_title ne $current_title and $old_title ne ''){ #save old $i = $i+1; $new_output_list[$i] = $old_title."\t".$all_errors_of_this_article; $all_errors_of_this_article = ''; #print "result:".$new_output_list[$i]."\n"; } # check new if ($old_title eq $current_title) { #double $all_errors_of_this_article = $all_errors_of_this_article.', '.$split_line[1]; #print 'double: '.$current_title."\t".$all_errors_of_this_article."\n"; } else { $all_errors_of_this_article = $split_line[1]; #print 'normal: '.$current_title."\t".$all_errors_of_this_article."\n"; } $old_title = $current_title; } $i = $i+1; $new_output_list[$i] = $old_title."\t".$all_errors_of_this_article; } $number_of_output_list = @new_output_list; #print "Line number: ".$number_of_output_list." 
(after sort)\n"; my @output_sort = @new_output_list; @new_output_list = (); $number_of_articles_with_error = @output_sort; print 'article with errors:'."\t".$number_of_articles_with_error." \n"; $for_statistic_number_of_articles_with_error = $number_of_articles_with_error; #count all errors and errors per number foreach (@output_sort) { my @split_line = split(/\t/, $_); my @split_errors = split(/,/, $split_line[1]); my $counter = @split_errors; $number_of_all_errors_in_all_articles = $number_of_all_errors_in_all_articles + $counter; foreach (@split_errors) { #count errors for number (over all = live + lastscan) $error_description[$_][8] ++; } } print 'errors in all articles:'."\t".$number_of_all_errors_in_all_articles." \n"; print 'Write output files:'."\t".'sort by articlename'."\n" if ($silent_modus ne 'silent'); ######### # write output for next scan print "\t\t\t".'output for next live scan'."\n" if ($silent_modus ne 'silent'); my $live_filename = ''; $live_filename = $output_directory.$project.'/'.$project.'_'.$error_list_filename; $live_filename = $output_directory.$project.'/'.$project.'_'.$error_list_filename_only if ($dump_or_live eq 'only'); $live_filename = $output_directory.$project.'/'.$project.'_'.$error_list_filename_dump if ($dump_or_live eq 'dump'); open(OUTPUT, ">$live_filename"); my $first_line = -1; foreach (@output_sort) { if ($_) { print OUTPUT "\n" if ($first_line > -1); print OUTPUT $_; $first_line ++; } } close(OUTPUT); # backup of dumpscan if ($dump_or_live eq 'dump'){ my $live_filename_backup = $output_directory.$project.'/'.$project.'_'.$error_list_filename_backup; system ('cp '.$live_filename.' 
'.$live_filename_backup) ; } #output only error 30 for de:User CyruzdaViruz if ($dump_or_live eq 'dump') { print "\t\t\t".'output error 30'."\n" if ($silent_modus ne 'silent'); $live_filename = ''; $live_filename = $output_directory.$project.'/'.$project.'_'.$error_list_filename_30; open(OUTPUT, ">$live_filename"); my $counter_30 = 0; for (my $i = 0; $i <= $error_counter; $i++) { #search in all errors if ( $page_with_error[$i][0] eq '30') { print OUTPUT "\n" if ($counter_30 > 0); $counter_30 = $counter_30 + 1; my $output_line = $page_with_error[$i][2]; $output_line =~ s/<\/nowiki>$//; $output_line =~ s/^<nowiki>//; $output_line = $page_with_error[$i][1]."\t".$output_line; print OUTPUT $output_line; } } close(OUTPUT); } #output only error 37 for de:User CyruzdaViruz # output all errors print "\t\t\t".'output error all errors'."\n" if ($silent_modus ne 'silent'); for (my $i = 1; $i < $number_of_max_errors; $i ++) { my $current_error = $i; $current_error = '0'.$current_error if (length($current_error) < 2); $current_error = '0'.$current_error if (length($current_error) < 3); $live_filename = ''; $live_filename = $output_directory.$project.'/error/'.$project.'_'.$error_list_filename_every.'_'.$current_error.'.html'; if (not (-e $output_directory.$project.'/error' )) { print 'create directory:'."\t". 
$output_directory.$project.'/error'."\n"; mkdir($output_directory.$project.'/error' ,0777); } open(OUTPUT, ">$live_filename"); print OUTPUT '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'."\n"; print OUTPUT '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">'."\n"; print OUTPUT '<head>'."\n"; print OUTPUT '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'."\n"; print OUTPUT '</head>'."\n"; print OUTPUT '<body>'."\n"; print OUTPUT '<h1>'.'Error number '.$i.'</h1>'."\n"; my $first_line = -1; foreach (@output_sort) { my $current_line = $_; if ($current_line) { my @line_split = split (/\t/, $current_line); if ($line_split[1] =~ /(, |^)$i(, |$)/) { #only error XXX print OUTPUT "\n" if ($first_line > -1); #print OUTPUT '<a href="'.$home.$line_split[0].'>'.$line_split[0].'</a><br />'; print OUTPUT $line_split[0].'<br />'; $first_line ++; } } } print OUTPUT '</body></html>'."\n"; close(OUTPUT); } ############################################# #output error_statistic #print "Last scan\n" if ($silent_modus ne 'silent'); my $error_statistic_file = ''; $error_statistic_file = $output_directory.$project.'/'.$project.'_'.$error_statistic_filename; $error_statistic_file = $output_directory.$project.'/'.$project.'_'.$error_statistic_filename_only if ($dump_or_live eq 'only'); my @input_error_statistic; my $previous_scan_error = 0; if ( -e $error_statistic_file ) { # read statistic data open(INPUT, "<$error_statistic_file"); @input_error_statistic = <INPUT>; close(INPUT); foreach (@input_error_statistic) { my $current_stat = $_; $current_stat =~ s/\n//; my @current_stat_split = split ( /\t/, $current_stat); my $stat_number = $current_stat_split[0]; my $stat_last_value = $current_stat_split[4]; if ($stat_number > 0) { $error_description[$stat_number][7] = $stat_last_value ; } else { $previous_scan_error = $stat_last_value; } } } my $output_statistic_table = ''; 
if ($dump_or_live eq 'live') { print "\t\t\t".'output error statistic'."\n" if ($silent_modus ne 'silent'); open(OUTPUT, ">$error_statistic_file"); print OUTPUT '0'."\t".'all errors'."\t\t\t".$number_of_all_errors_in_all_articles."\n"; for (my $i = 0; $i < $number_of_max_errors; $i++) { if ($error_description[$i][1] ne '') { print OUTPUT $i; if ($error_description[$i][5] ne '') { print OUTPUT "\t".$error_description[$i][5]; } else { print OUTPUT "\t".$error_description[$i][1]; } print OUTPUT "\t".$error_description[$i][0]; print OUTPUT "\t".$error_description[$i][4]; print OUTPUT "\t".$error_description[$i][8]; print OUTPUT "\n"; } } close(OUTPUT); #statistic_table $output_statistic_table = $output_statistic_table.'{| class="wikitable sortable"'."\n"; $output_statistic_table = $output_statistic_table.'|- '."\n"; $output_statistic_table = $output_statistic_table.'! nr. '."\n"; $output_statistic_table = $output_statistic_table.'! name '."\n"; $output_statistic_table = $output_statistic_table.'! script '."\n"; $output_statistic_table = $output_statistic_table.'! '.$project."\n"; $output_statistic_table = $output_statistic_table.'! previous scan '."\n"; $output_statistic_table = $output_statistic_table.'! last scan '."\n"; $output_statistic_table = $output_statistic_table.'! trend '."\n"; $output_statistic_table = $output_statistic_table.'! change '."\n"; $output_statistic_table = $output_statistic_table.'|- '."\n"; for (my $i = 0; $i < $number_of_max_errors; $i++) { if ($error_description[$i][1] ne '') { $output_statistic_table = $output_statistic_table.'| '.$i." || "; my $headline_error = ''; if ($error_description[$i][5] ne '') { # foreign language #$output_statistic_table = $output_statistic_table.' '.$error_description[$i][5]." || "; $headline_error= $error_description[$i][5]; } else { # english #$output_statistic_table = $output_statistic_table.' '.$error_description[$i][1]." 
|| "; $headline_error= $error_description[$i][1]; } if ($error_description[$i][3] > 0 ) { my $headline_error_link =$headline_error; $headline_error_link =~ s/<nowiki>//g; $headline_error_link =~ s/<\/nowiki>//g; $headline_error_link =~ s/</.3C/g; # < .3C $headline_error_link =~ s/</.3E/g; # > .3E $headline_error_link =~ s/\//.2F/g; # / .2F $headline_error = '[[#'.$headline_error_link.'|'.$headline_error.']]' } $output_statistic_table = $output_statistic_table.' '.$headline_error." || "; $output_statistic_table = $output_statistic_table.' '; $output_statistic_table = $output_statistic_table.'out of service' if ($error_description[$i][0] == -1); $output_statistic_table = $output_statistic_table.'deactivated' if ($error_description[$i][0] == 0); $output_statistic_table = $output_statistic_table.'high' if ($error_description[$i][0] == 1); $output_statistic_table = $output_statistic_table.'middle' if ($error_description[$i][0] == 2); $output_statistic_table = $output_statistic_table.'low' if ($error_description[$i][0] == 3); $output_statistic_table = $output_statistic_table." || "; $output_statistic_table = $output_statistic_table.' '; $output_statistic_table = $output_statistic_table.'unknown' if ($error_description[$i][4] == -1); $output_statistic_table = $output_statistic_table.'deactivated' if ($error_description[$i][4] == 0); $output_statistic_table = $output_statistic_table.'high' if ($error_description[$i][4] == 1); $output_statistic_table = $output_statistic_table.'middle' if ($error_description[$i][4] == 2); $output_statistic_table = $output_statistic_table.'low' if ($error_description[$i][4] == 3); $output_statistic_table = $output_statistic_table." || "; #last scan my $output_number = $error_description[$i][7]; $output_number = '' if ($output_number == 0); $output_statistic_table = $output_statistic_table.' style="text-align:right;" | '.$output_number." 
|| "; #current scan $output_number = $error_description[$i][8]; $output_number = '' if ($output_number == 0); $output_statistic_table = $output_statistic_table.' style="text-align:right;" | '.$output_number." || "; my $diff = $error_description[$i][8] - $error_description[$i][7] ; #trend $output_statistic_table = $output_statistic_table.' ↘' if ($diff < 0); $output_statistic_table = $output_statistic_table.' →' if ($diff == 0 and $error_description[$i][8] > 0 ); $output_statistic_table = $output_statistic_table.' ↗' if ($diff > 0); $output_statistic_table = $output_statistic_table. " || "; #change $output_number = $diff; $output_number = '' if ($output_number == 0); $output_statistic_table = $output_statistic_table.' style="text-align:right;" | '.$output_number."\n"; $output_statistic_table = $output_statistic_table.'|- '."\n"; } } $output_statistic_table = $output_statistic_table.'|} '."\n\n"; my $output_page_number = $page_number -1; $output_statistic_table = $output_statistic_table.'* Last scan:'."\n"; $output_statistic_table = $output_statistic_table."** the script scanned '''".$output_page_number."''' articles:"."\n"; $output_statistic_table = $output_statistic_table."*** '''".$for_statistic_new_article."''' new articles"."\n"; $output_statistic_table = $output_statistic_table."*** '''".$for_statistic_last_change_article."''' articles with changes"."\n"; $output_statistic_table = $output_statistic_table."*** some of '''".$for_statistic_number_of_articles_with_error."''' articles with errors from last dump or scan"."\n"; $output_statistic_table = $output_statistic_table."*** '''".$for_statistic_geo_article."''' other articles (for example articles with errors in geocoordinate)."."\n"; $output_statistic_table = $output_statistic_table.'** errors in previous scan: '.$previous_scan_error."\n"; $output_statistic_table = $output_statistic_table.'** errors in last scan: '.$number_of_all_errors_in_all_articles."\n"; my $output_number_all = 
$number_of_all_errors_in_all_articles - $previous_scan_error; $output_statistic_table = $output_statistic_table.'** change: '.$output_number_all."\n\n"; $output_statistic_table = $output_statistic_table.'* Downloads: (HTML or TXT in UTF-8)'."\n"; $output_statistic_table = $output_statistic_table."** Next scan: normally in 24 hours, then please copy this [http://toolserver.org/~sk/checkwiki/".$project.'/'.$project."_output_for_wikipedia.html page at the toolserver] to this page."."\n"; $output_statistic_table = $output_statistic_table."** [http://toolserver.org/~sk/checkwiki/".$project.'/'.$project."_error_list.txt List of articles with errors] "."\n"; $output_statistic_table = $output_statistic_table."** [http://toolserver.org/~sk/checkwiki/".$project.'/'.$project."_error_list_error_030.txt List of articles with error 30] "."\n"; $output_statistic_table = $output_statistic_table."** [http://toolserver.org/~sk/checkwiki/".$project.'/'.$project."_error_list_error_037.txt List of articles with error 37] "."\n"; $output_statistic_table = $output_statistic_table."** [http://toolserver.org/~sk/checkwiki/".$project.'/'.$project."_translation.txt Text for Translation] "; if ( $translation_page ne '') { $output_statistic_table = $output_statistic_table .'for [['.$translation_page.']]'; } $output_statistic_table = $output_statistic_table."\n"; $output_statistic_table = $output_statistic_table.'* News:'."\n"; $output_statistic_table = $output_statistic_table.'** 2009-05-07: New error 79, 80'."\n"; $output_statistic_table = $output_statistic_table.'** 2009-05-10: [[:en:Wikipedia:AutoWikiBrowser|AWB]] in its new [[:en:Wikipedia:AutoWikiBrowser/History|4.5.3.0 release]] includes several features for this project: #57 is fixed automatically; some of #7 are corrected; a special mode assists in fixing brackets (#10, #43, #46, #47) and in some cases, fixes them directly.'."\n"; $output_statistic_table = $output_statistic_table.'** 2009-05-21: New error 81 and table sortable'."\n"; 
$output_statistic_table = $output_statistic_table.'** 2009-05-23: New error 82 '."\n"; $output_statistic_table = $output_statistic_table.'** 2009-05-25: Split error 7 in error 7 and 83'."\n"; $output_statistic_table = $output_statistic_table ."\n\n"; } ####################################### # find all error-codes, generate a list with all errors_codes my @list_of_errors; for (my $i = 0; $i <= $error_counter; $i++) { my $test_counter = @list_of_errors; #$test_counter = $test_counter -1; #print 'Test_counter:'.$test_counter."\n"; my $found_error = 0; if ($test_counter > 0 ) { for (my $j = 0; $j < $test_counter; $j++) { if ($list_of_errors[$j] == $page_with_error[$i][0]) { $found_error = 1; } } } push(@list_of_errors, $page_with_error[$i][0] ) if ( $found_error == 0); #foreach(@list_of_errors) { # print "$_ "; #} #print "\n"; } ############################################# # sort error_list by headline of error my @sort_helper; my $number_of_different_errors = @list_of_errors; print 'different errors: '."\t".$number_of_different_errors."\n"; $number_of_different_errors = $number_of_different_errors -1; for (my $i = 0; $i <=$number_of_different_errors; $i++) { if ($error_description[$list_of_errors[$i]][5] ne '') { $sort_helper[$i] = $error_description[$list_of_errors[$i]][5]."\t".$list_of_errors[$i]; } else { $sort_helper[$i] = $error_description[$list_of_errors[$i]][1]."\t".$list_of_errors[$i]; } } my @sort_list = sort(@sort_helper); foreach (@sort_list){ #rint $_."\n"; $_ = substr($_ , index($_,"\t")+1); #rint $_."\n"; } @list_of_errors = @sort_list; ############################################# # output for wikipedia if ($dump_or_live eq 'live' or $dump_or_live eq 'dump' ) { print "\t\t\t".'output for Wikipedia'."\n" if ($silent_modus ne 'silent'); my $html_head = ''; $html_head = $html_head.'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'."\n"; $html_head = $html_head.'<html 
xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">'."\n"; $html_head = $html_head.'<head>'."\n"; $html_head = $html_head.'<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'."\n"; $html_head = $html_head.'</head>'."\n"; $html_head = $html_head.'<body>'."\n"; $html_head = $html_head.'<pre> <code>'."\n"; my $output_wikipedia_text = ''; my $statistic_text = ''; use CGI::Carp qw(fatalsToBrowser); my $output_error_counter = $error_counter +1; my $output_page_number = $page_number -1; #$statistic_text = $statistic_text."* Last scan: The script found '''".$error_output."''' errors in '''". $number_of_articles_with_error_output."''' articles."."\n"; $statistic_text = $statistic_text."* With the last scan the script checked '''".$output_page_number."''' articles. "."\n"; $statistic_text = $statistic_text."* At the moment the script identified '''".$number_of_all_errors_in_all_articles."''' ideas for improvement in '''".$for_statistic_number_of_articles_with_error."''' articles."."\n"; $statistic_text = $statistic_text."* Scan begin: '''".$akJahr."-".$akMonat."-".$akMonatstag." ".$akStunden.":".$akMinuten."''' and end:"; #output long list archiv my $file_output_statistic_list = $output_directory.$project.'/'.$project.'_'.$error_statistic_filename_list; print "\t\t\t".'output error statistic'."\n" if ($silent_modus ne 'silent'); open(OUTPUT_STATISTIC_LIST, "+>>$file_output_statistic_list"); print OUTPUT_STATISTIC_LIST $akJahr."-".$akMonat."-".$akMonatstag." ".$akStunden.":".$akMinuten.":".$akSekunden; get_time(); print OUTPUT_STATISTIC_LIST "\t".$akJahr."-".$akMonat."-".$akMonatstag." ".$akStunden.":".$akMinuten.":".$akSekunden; $statistic_text = $statistic_text." '''".$akJahr."-".$akMonat."-".$akMonatstag." 
".$akStunden.":".$akMinuten."''' (GMT, Toolserver)"."\n"; $time_end = time(); my $duration = $time_end - $time_start; my $duration_minutes = int($duration / 60); my $duration_secounds = int(((int(100 * ($duration / 60)) / 100)-$duration_minutes)*60); $statistic_text = $statistic_text."* Duration: ".$duration_minutes.' min. '.$duration_secounds." sec.\n\n"; print OUTPUT_STATISTIC_LIST "\t".$duration_minutes."\t".$duration_secounds; print OUTPUT_STATISTIC_LIST "\t".$number_of_all_errors_in_all_articles."\n"; close(OUTPUT_STATISTIC_LIST); my $error_table_text = ''; print "\t\t\t".'Build wiki-structure'."\n" if ($silent_modus ne 'silent'); for (my $k = 1; $k <= 3; $k ++) { # priority if ($k == 1) { # Top priority if ($top_priority_project eq ''){ $error_table_text = $error_table_text. "\n".'== '.$top_priority_script.' =='."\n\n" } else { $error_table_text = $error_table_text. "\n".'== '.$top_priority_project.' =='."\n\n" } } if ($k == 2) { # middle priority if ($middle_priority_project eq ''){ $error_table_text = $error_table_text. "\n".'== '.$middle_priority_script.' =='."\n\n" } else { $error_table_text = $error_table_text. "\n".'== '.$middle_priority_project.' =='."\n\n" } } if ($k == 3) { # lowest priority if ($lowest_priority_project eq ''){ $error_table_text = $error_table_text. "\n".'== '.$lowest_priority_script.' =='."\n\n" } else { $error_table_text = $error_table_text. "\n".'== '.$lowest_priority_project.' =='."\n\n" } } #$error_table_text = $error_table_text. "\n".'== Top priority =='."\n\n" #$error_table_text = $error_table_text. "\n".'== Middle priority =='."\n\n" if ($k == 2); #$error_table_text = $error_table_text. 
"\n".'== Lowest priority =='."\n\n" if ($k == 3); foreach(@list_of_errors) { my $j = $_; if ( ($error_description[$j][0] == $k and $error_description[$j][4] == -1) # script priority or ($error_description[$j][4] == $k ) # foreign language priority ) { # if an error is existing #print 'Error '.$j."\n"; my $error_text = ''; if ( $error_description[ $j ][5] ne '') { # foreign language headline and description $error_table_text = $error_table_text. "\n".'=== '.$error_description[ $j ][5].' ==='."\n"; $error_table_text = $error_table_text. '<!-- error number '. $j .' -->'."\n"; $error_table_text = $error_table_text. $error_description[ $j ][6]."\n\n"; } else { # english headline and description $error_table_text = $error_table_text. "\n".'=== '.$error_description[ $j ][1].' ==='."\n"; $error_table_text = $error_table_text. '<!-- error number '. $j .' -->'."\n"; $error_table_text = $error_table_text. $error_description[ $j ][2]."\n\n"; } my $tabelle = "true"; $tabelle = "false" if ($j eq '37'); $tabelle = "false" if ($j eq '3'); # find the number errors with this error-code my $error_counter_this_error = 0; for (my $i = 0; $i <= $error_counter; $i++) { # search all errors if ($j eq $page_with_error[$i][0]) { $error_counter_this_error = $error_counter_this_error + 1; } } # select every X. 
article my $step_size = 1; $step_size = int( $error_counter_this_error / $max_error_count) if ($error_counter_this_error > $max_error_count); #print 'Step size '.$step_size."\n"; my $error_counter_step = 0; # count the steps my $to_max_error_count = $max_error_count; # number of article to max for (my $i = 0; $i <= $error_counter; $i++) { #search in all errors if ($j eq $page_with_error[$i][0]) { $error_counter_step = $error_counter_step + 1; if ($error_counter_step == $step_size and $to_max_error_count > 0) { # must inside $error_counter_step = 0; $to_max_error_count = $to_max_error_count -1; if ( $tabelle eq "false" ) { # one row without table $error_text =~ s/\n//; $error_text = $error_text.", [[:".$page_with_error[$i][1]."]]\n"; $error_text =~ s/^, //; } else { # one roe in a table $error_text = $error_text.'|-'."\n".'| [[:'; $error_text = $error_text.$page_with_error[$i][1]; $error_text = $error_text."]] || ".$page_with_error[$i][2]."\n"; } } } } if ($tabelle eq 'true') { $error_text = '{| class="wikitable sortable"'."\n".'! article'."\n".'! info '."\n".'|-'."\n".$error_text.'|}'; } $error_table_text = $error_table_text. "This error was found '''". $error_description[$j][8] ."''' times."; $error_table_text = $error_table_text. ' - This output was limited to '.$max_error_count.' article.' if ($error_description[$j][8] > $max_error_count); my $current_error = $j; $current_error = '0'.$current_error if (length($current_error) < 2); $current_error = '0'.$current_error if (length($current_error) < 3); $error_table_text = $error_table_text. " - [http://toolserver.org/~sk/checkwiki/".$project.'/'.$project."_error_list_error_".$current_error.".html List of all articles with error ".$current_error."]"; $error_table_text = $error_table_text. "\n\n"; $error_table_text = $error_table_text. 
$error_text; } } } $output_wikipedia_text = $output_wikipedia_text.$start_text; $output_wikipedia_text = $output_wikipedia_text.$statistic_text; $output_wikipedia_text = $output_wikipedia_text.$description_text; $output_wikipedia_text = $output_wikipedia_text.$output_statistic_table; $output_wikipedia_text = $output_wikipedia_text.$error_table_text; $output_wikipedia_text = $output_wikipedia_text. "\n\n"; $output_wikipedia_text = $output_wikipedia_text.$category_text; $output_wikipedia_text = $output_wikipedia_text. "\n\n"; $output_wikipedia_text = $output_wikipedia_text. '[[af:Wikipedia:WikiProject Check Wikipedia]]'."\n" if ($project ne 'afwiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[ar:ويكيبيديا:فحص ويكيبيديا]]'."\n" if ($project ne 'arwiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[ca:Viquipèdia:WikiProject Check Wikipedia]]'."\n" if ($project ne 'cawiki') ; #$output_wikipedia_text = $output_wikipedia_text. '[[commons:Commons:WikiProject Check Wikipedia]]'."\n" if ($project ne 'commonswiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[cs:Wikipedie:WikiProjekt Check Wikipedia]]'."\n" if ($project ne 'cswiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[cy:Wicipedia:WikiProject Check Wikipedia]]'."\n" if ($project ne 'cywiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[da:Wikipedia:WikiProjekt Check Wikipedia]]'."\n" if ($project ne 'dawiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[de:Wikipedia:WikiProject Check Wikipedia]]'."\n" if ($project ne 'dewiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[en:Wikipedia:WikiProject Check Wikipedia]]'."\n" if ($project ne 'enwiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[eo:Vikipedio:WikiProjekt Check Wikipedia]]'."\n" if ($project ne 'eowiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[es:Wikiproyecto:Check Wikipedia]]'."\n" if ($project ne 'eswiki') ; $output_wikipedia_text = $output_wikipedia_text. 
'[[fi:Wikipedia:Wikiprojekti Check Wikipedia]]'."\n" if ($project ne 'fiwiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[fr:Projet:Correction syntaxique]]'."\n" if ($project ne 'frwiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[fy:Wikipedy:WikiProject Check Wikipedia]]'."\n" if ($project ne 'fywiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[he:ויקיפדיה:Check Wikipedia]]'."\n" if ($project ne 'hewiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[hu:Wikipédia:Ellenőrzőműhely]]'."\n" if ($project ne 'huwiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[id:Wikipedia:WikiProjekt Check Wikipedia]]'."\n" if ($project ne 'idwiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[is:Wikipedia:WikiProject Check Wikipedia]]'."\n" if ($project ne 'iswiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[it:Wikipedia:Elenchi generati offline/Check Wikipedia]]'."\n" if ($project ne 'itwiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[ja:Wikipedia:ウィキプロジェクト ウィキ文法のチェック]]'."\n" if ($project ne 'jawiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[la:Vicipaedia:WikiProject Check Wikipedia]]'."\n" if ($project ne 'lawiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[nds:Wikipedia:WikiProject Check Wikipedia]]'."\n" if ($project ne 'ndswiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[nds-nl:Wikipedie:WikiProject Check Wikipedia]]'."\n" if ($project ne 'nds_nlwiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[nl:Wikipedia:Wikiproject/Check Wikipedia]]'."\n" if ($project ne 'nlwiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[no:Wikipedia:WikiProject Check Wikipedia]]'."\n" if ($project ne 'nowiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[pdc:Wikipedia:WikiProject Check Wikipedia]]'."\n" if ($project ne 'pdcwiki') ; $output_wikipedia_text = $output_wikipedia_text. 
'[[pl:Wikiprojekt:Check Wikipedia]]'."\n" if ($project ne 'plwiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[pt:Wikipedia:Projetos/Check Wikipedia]]'."\n" if ($project ne 'ptwiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[ro:Wikipedia:WikiProject Check Wikipedia]]'."\n" if ($project ne 'rowiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[ru:Википедия:Страницы с ошибками в викитексте]]'."\n" if ($project ne 'ruwiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[sk:Wikipédia:WikiProjekt Check Wikipedia]]'."\n" if ($project ne 'skwiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[sv:Wikipedia:Projekt wikifiering/Syntaxfel]]'."\n" if ($project ne 'svwiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[tr:Vikipedi:Vikipedi proje kontrolü]]'."\n" if ($project ne 'trwiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[uk:Вікіпедія:Проект:Check Wikipedia]]'."\n" if ($project ne 'ukwiki') ; $output_wikipedia_text = $output_wikipedia_text. '[[yi:װיקיפּעדיע:קאנטראלירן_בלעטער]]'."\n" if ($project eq 'yiwiki') ; $output_wikipedia_text = $output_wikipedia_text. 
'[[zh:维基百科:专题/错误检查]]'."\n" if ($project ne 'zhwiki') ; my $html_foot = ''; $html_foot = $html_foot.'</body></html>'."\n"; $html_foot = $html_foot.'<pre> <code>'."\n"; my $output_filename = ''; $output_filename = $output_directory.$project.'/'.$project.'_'.$output_live_wiki; $output_filename = $output_directory.$project.'/'.$project.'_'.$output_dump_wiki if ($dump_or_live eq 'dump'); $output_filename_html = $output_filename; $output_filename_html =~ s/\.txt/\.html/; #$live_filename = $output_directory.$project.'/'.$project.'_only_'.$error_list_filename if ($dump_or_live eq 'only'); open(OUTPUT, ">$output_filename"); print OUTPUT $output_wikipedia_text; close(OUTPUT); open(OUTPUT, ">$output_filename_html"); print OUTPUT $html_head; my $output_wikipedia_text_html = $output_wikipedia_text; $output_wikipedia_text_html =~ s/</</g; $output_wikipedia_text_html =~ s/>/>/g; print OUTPUT $output_wikipedia_text_html; print OUTPUT $html_foot; close(OUTPUT); ####################### # output every error in a file alone output_geo() if (-e $file_module_coordinate) ; } #open(ERROR_LIST, ">>".$output_directory.'/'.$project.'/'.$project.'_'.$error_list_filename); #print $output_directory.'/'.$project.'_'.$error_list_filename."\n"; #print ERROR_LIST $title."\t".$error_code."\n"; #close (ERROR_LIST); } sub output_statistic { $time_end = time(); my $duration = $time_end - $time_start; my $duration_minutes = int($duration / 60); my $duration_secounds = int(((int(100 * ($duration / 60)) / 100)-$duration_minutes)*60); print 'Duration:'."\t\t".$duration_minutes.' minutes '.$duration_secounds.' secounds'."\n"; print $project.' 
'.$dump_or_live."\n" if ($silent_modus ne 'silent'); } ############################################################################# sub check_article{ my $steps = 500; $steps = 1 if ($dump_or_live eq 'live'); $steps = 5000 if ($silent_modus eq 'silent'); if ( $title eq 'At-Tabarī' or $title eq 'Rumänien' or $title eq 'Liste der Ortsteile im Saarland') { # $details_for_page = 'yes'; } my $text_for_tests = "Hallo Barnaby, Wendy. The Plague Makers: The Secret World of Biological Warfare, Frog Ltd, 1999. in en [[Japanese war crimes]] === Test === ISBN 1-883319-85-4 ISBN 0-7567-5698-7 ISBN 0-8264-1258-0 ISBN 0-8264-1415-X * Tulku - ISBN 978 90 04 12766 0 (wrong ISBN) :-sdfsdf :#sadf ISBN 3-8304-1007-7 ok ISBN 3-00-016815-X ok ISBN 978-0-8330-3930-9 ok ISBN3-00-016815-X [[Category:abc]] and [[Category:Abc]] [[1911 př. n. l.|1911]]–[[1897 př. n. l.|1897]] př. n. l. Rodné jméno = <hiero><--M17-Y5:N35-G17-F4:X1--></hiero> <br /> Trůnní jméno = <hiero>M23-L2-<--N5:S12-D28*D28:D28--></hiero><br /> <references group='Bild' /> 124345 ===This is a headline with reference <ref>A reference with '''bold''' text</ref>=== Nubkaure <hiero>-V28-V31:N35-G17-C10-</hiero> Jméno obou paní = <hiero>-G16-V28-V31:N35-G17-C10-</hiero><br /> [[Image:logo|thumb| < small> sdfsdf</small>]] <ref>Abu XY</ref> im text ISBN 3-8304-1007-7 im text <-- ok im text ISBN 3-00-016815-X im text ok im text ISBN 978-0-8330-3930-9 im text ok [[Image:logo|thumb| Part < small> Part2</small> Part2]] [[Image:logo|thumb| Part < small> Part</small>]] ISBN-10 3-8304-1007-7 bad ISBN-10: 3-8304-1007-7 bad ISBN-13 978-0-8330-3930-9 bad ISBN-13: 978-0-8330-3930-9 -->bad <ref>Abu XY</ref> ISBN 123451678XXXX bad ISBN 123456789x ok ISBN 3-00-0168X5-X bad *ISBN 3-8304-1007-7 121 Test ok *ISBN 3-8304-1007-7121 Test bad *ISBN 3 8304 1007 7 121 Test ok *ISBN 978-0-8330-39 309 Test ok *ISBN 9 7 8 0 8 3 3 0 3 9 3 0 9 Test bad 10 ok 13 [http://www.dehoniane.it/edb/cat_dettaglio.php?ISBN=24109] bad {{test|ISBN=3 8304 1007 7 121 
|test=[[text]]}} bad [https://www5.cbonline.nl/pls/apexcop/f?p=130:1010:401581703141772 ISBN-bureau] bad ISBN 3-8304-1007-7 <\br> </br> [[:hu:A Gibb fivérek által írt dalok listája]] Big Problem [[en:Supermann]] testx === Liste === === 1Acte au sens d'''instrumentum'' === === 2Acte au sens d'''instrumentum''' === === 3Acte au sens d''instrumentum'' === ISBN 978-88-10-24109-7 * ISBN 0-691-11532-X ok * ISBN 123451678XXXX bad * ISBN-10 1234567890 bad * ISBN-10: 1234567890 bad * ISBN-13 1234567890123 bad * ISBN-13: 1234567890123 bad * ISBN 123456789x Test ok * ISBN 123456789x x12 Test * ISBN 123456789012x Test * ISBN 1234567890 12x Test * ISBN 123456789X 123 Test * ISBN 1 2 3 4 5 6 7 8 9 0 Test [http://www.dehoniane.it/edb/cat_dettaglio.php?ISBN=24109] [https://www5.cbonline.nl/pls/apexcop/f?p=130:1010:401581703141772 ISBN-bureau] * Tramlijn_Ede_-_Wageningen - ISBN-nummer * Tulku - ISBN 978 90 04 12766 0 (wrong ISBN) * Michel_Schooyans - [http://www.dehoniane.it/edb/cat_dettaglio.php?ISBN=24109] *VARA_gezinsencyclopedie - [https://www5.cbonline.nl/pls/apexcop/f?p=130:1010:401581703141772 ISBN-bureau] Testtext hat einen [[|Link]], der nirgendwo hinführt.<ref>Kees Heitink en Gert Jan Koster, De tram rijdt weer!: Bennekomse tramgeschiedenis 1882 - 1937 - 1968 - 2008, 28 bladzijden, geen ISBN-nummer, uitverkocht.</ref>. === 4Acte au sens d''instrumentum'' === [[abszolútérték-függvény||]] ''f''(''x'') – ''f''(''y'') [[abszolútérték-függvény||]] aus huwiki * [[Antwerpen (stad)|Antwerpen]] heeft na de succesvolle <BR>organisatie van de Eurogames XI in [[2007]] voorstellen gedaan om editie IX van de Gay Games in [[2014]] of eventueel de 3e editie van de World OutGames in [[2013]] naar Antwerpen te halen. Het zogeheten '[[bidbook]]' is ingediend en het is afwachten op mogelijke toewijzing door de internationale organisaties. <br> *a[[B:abc]]<br> *bas addf< br> *casfdasdf< br > *das fdasdf< br / > [[Chełmno]] and sdfsf ISBN 3434462236 95-98. ISBN 0 7876 5784 0. . 
=== UNO MANDAT === 0-13-110370-9 * [http://www.research.att.com/~bs/3rd.html The C++ Programming Language]: [[Bjarne Stroustrup]], special ed., Addison-Weslye, ISBN 0-201-70073-5, 2000 * The C++ Standard, Incorporating Technical Corrigendum 1, BS ISO/IEC 14882:2003 (2nd ed.), John Wiley & Sons, ISBN 0-470-84674-7 * [[Brian Kernighan|Brian W. Kernighan]], [[Dennis Ritchie|Dennis M. Ritchie]]: ''[[The C Programming Language]]'', Second Edition, Prentice-Hall, ISBN 0-13-110370-9 1988 * [http://kmdec.fjfi.cvut.cz/~virius/publ-list.html#CPP Programování v C++]: Miroslav Virius, [http://www.cvut.cz/cs/uz/ctn Vydavatelství ČVUT], druhé vydání, ISBN 80-01-02978-6 2004 * Naučte se C++ za 21 dní: Jesse Liberty, [http://www.cpress.cz/ Computer Press], ISBN 80-7226-774-4, 2002 * Programovací jazyk C++ pro zelenáče: Petr Šaloun, [http://www.neo.cz Neokortex] s.r.o., ISBN 80-86330-18-4, 2005 * Rozumíme C++: Andrew Koenig, Barbara E. Moo, [http://www.cpress.cz/ Computer Press], ISBN 80-7226-656-X, 2003 * [http://gama.fsv.cvut.cz/~cepek/uvodc++/uvodc++-2004-09-11.pdf Úvod do C++]: Prof. Ing. Aleš Čepek, CSc., Vydavatelství ČVUT, 2004 *eaa[[abc]]< br / > <ref>sdfsdf</ref> . Verlag LANGEWIESCHE, ISBN-10: 3784551912 und ISBN-13: 9783784551913 === Meine Überschrift ABN === ISBN 1234-X-1234 *fdd asaddf…</br 7> {{Zitat|Der Globus ist schön. <ref name='asda'>Buch 27</ref>}} {{Zitat|Madera=1000 <ref name='asda'>Buch 27</ref>|Kolumbus{{Höhe|name=123}}|kirche=4 }} ==== Саларианцы ==== [[Breslau]] ([[Wrocław]]) *gffasfdasdf<\br7> {{Testvorlage|name=heeft na de succesvolle <BR>organisatie van de [[Eurogames XIa|Eurogames XI]] inheeft na de succesvolle <BR>organisatie van de Eurogames XI inheeft na de succesvolle <BR>organisatie van de Eurogames XI in123<br>|ott]o=kao}} *hgasfda sdf<br /> <ref>sdfsdf2</ref>! 
<br><br> ===== PPM, PGM, PBM, PNM ===== " .'test<br1/><br/1>–uberlappung<references />3456Ende des Text'; #$text = $text_for_tests; get_namespace(); print_article_title_every_x( $steps ); get_comments_nowiki_pre(); get_math(); get_source(); get_code(); get_isbn(); get_templates(); get_links(); get_images(); get_tables(); get_gallery(); get_hiero(); #problem with <-- and --> (error 056) get_ref(); check_for_redirect(); get_categories(); get_interwikis(); create_line_array(); get_line_first_blank(); get_headlines(); error_check(); get_coordinates() if (-e $file_module_coordinate) ; #get_persondata(); } sub print_article_title_every_x{ #print in the Loop every x article a short message #Output every x articles my $steps =$_[0]; #print "$page_number \t$title\n"; my $x = int( $page_number / $steps ) * $steps ; if ($page_number == 1 or $page_number == $x ) { my $project_output = $project; $project_output =~ s/wiki//; print $project_output.' '; print 'p='.$page_number.' '; if ($dump_or_live eq 'live') { my $output_current_live_article = $current_live_article + 1; print $current_live_error_scan.'/'.$output_current_live_article.'/'.$number_article_live_to_scan; } else { print 'id='.$page_id ; } print ' '.$title."\n"; } } sub get_namespace{ #check the namespace of an article # if here is an error then maybe it is a new namespace in this project; show sub load_metadata_from_file if ( index( $title, ':' ) > -1) { #print $title."\n"; for (my $i = 0; $i < $namespaces_count; $i++) { #print $i." ".$namespace[$i][0]." ".$namespace[$i][1]." ".$namespace[$i][2] ."\n" ;#if ($title eq 'Sjabloon:Gemeente'); $page_namespace = $namespace[$i][0] if ( index ($title, $namespace[$i][1].':') == 0); $page_namespace = $namespace[$i][0] if ( index ($title, $namespace[$i][2].':') == 0); } #print $page_namespace."\n" ;#if ($title eq 'Sjabloon:Gemeente'); #print $namespacealiases_count."\n"; for (my $i = 0; $i < $namespacealiases_count; $i++) { #print $i." ".$namespacealiases[$i][0]." 
".$namespacealiases[$i][1] ."\n" ;#if ($title eq 'Sjabloon:Gemeente'); $page_namespace = $namespacealiases[$i][0] if ( index ($title, $namespacealiases[$i][1].':') == 0); } #print $page_namespace."\n" ;#if ($title eq 'Sjabloon:Gemeente'); $page_namespace = 0 if ($page_namespace == -100); } else { $page_namespace = 0; } } sub get_comments_nowiki_pre{ my $last_pos = -1; my $pos_comment = -1; my $pos_nowiki = -1; my $pos_pre = -1; my $pos_first = -1; my $loop_again = 0; do { # next tag $pos_comment = index ($text, '<!--', $last_pos); $pos_nowiki = index ($text, '<nowiki>', $last_pos); $pos_pre = index ($text, '<pre>', $last_pos); #print $pos_comment.' '.$pos_nowiki.' '.$pos_pre."\n"; #first tag $tag_first = ''; $tag_first = 'comment' if( $pos_comment > -1 ); $tag_first = 'nowiki' if( ( $pos_nowiki > -1 and $tag_first eq '') or( $pos_nowiki > -1 and $tag_first eq 'comment' and $pos_nowiki < $pos_comment)); $tag_first = 'pre' if( ( $pos_pre > -1 and $tag_first eq '') or( $pos_pre > -1 and $tag_first eq 'comment' and $pos_pre < $pos_comment) or( $pos_pre > -1 and $tag_first eq 'nowiki' and $pos_pre < $pos_nowiki)); #print $tag_first."\n"; #check end tag my $pos_comment_end = index ($text, '-->', $pos_comment + length('<!--') ); my $pos_nowiki_end = index ($text, '', $pos_nowiki + length('') ); my $pos_pre_end = index ($text, '</pre>', $pos_pre + length('<pre>') ); #comment if ($tag_first eq 'comment' and $pos_comment_end > -1) { #found <!-- and --> $last_pos = get_next_comment($pos_comment + $last_pos); $loop_again = 1; #print 'comment'.' '.$pos_comment.' 
'.$last_pos."\n"; } if ($tag_first eq 'comment' and $pos_comment_end == -1) { #found <!-- and no --> $last_pos = $pos_comment +1; $loop_again = 1; #print 'comment no end'."\n"; my $text_output = substr( $text, $pos_comment); $text_output = text_reduce($text_output, 80); error_005_Comment_no_correct_end ( $text_output ); #print $text_output."\n"; } #nowiki if ($tag_first eq 'nowiki' and $pos_nowiki_end > -1) { # found <nowiki> and
#pre if ($tag_first eq 'pre' and $pos_pre_end > -1) {
# found
and
$last_pos = get_next_pre($pos_pre + $last_pos); $loop_again = 1; #print 'pre'.' '.$pos_pre.' '.$last_pos."\n"; } if ($tag_first eq 'pre' and $pos_pre_end == -1) {
# found
and no
#print $last_pos.' '.$pos_pre."\n"; $last_pos = $pos_pre +1; $loop_again = 1; #print 'pre no end'."\n"; my $text_output = substr( $text,$pos_pre); $text_output = text_reduce($text_output, 80); error_024_pre_no_correct_end ( $text_output); }
#end if ($pos_comment == -1 and $pos_nowiki == -1 and $pos_pre == -1) { # found no ', $pos_start + length(''); $comment_counter = $comment_counter +1; $comments[$comment_counter][0] = $pos_start; $comments[$comment_counter][1] = $pos_end; $comments[$comment_counter][2] = substr($text, $pos_start, $pos_end - $pos_start ); #print $comments[$comment_counter][2]."\n";
#replace comment with space my $text_before = substr( $text, 0, $pos_start ); my $text_after = substr( $text, $pos_end ); my $filler = ; for (my $i = 0; $i < ($pos_end-$pos_start); $i++) { $filler = $filler.' '; } $text = $text_before.$filler.$text_after; $result = $pos_end; } return ($result );
}
# Blank out every <math>...</math> element in the global $text.
#
# Recognised opening tags (matched case-insensitively): '<math>',
# '<math style=', '<math title=' and '<math alt='.  Each complete element is
# overwritten with the same number of spaces, so the length of $text and all
# character offsets behind the element stay unchanged.
#
# If an opening tag has no closing '</math>', the first 50 characters from
# the opening tag are passed to error_013_Math_no_correct_end() and the
# search stops.
#
# Reads and modifies the global $text; takes no arguments.
sub get_math {
    my $search_from = 0;    # offset in $text where the next search begins

    while (1) {
        # Search in a lower-cased copy so that <MATH> is found as well.
        # NOTE(review): offsets found in the copy are applied to $text; this
        # assumes lc() does not change the length of the string.
        my $lc_text = lc($text);

        # Find the earliest opening tag among the recognised forms.
        my $pos_start = -1;
        for my $open_tag ('<math>', '<math style=', '<math title=', '<math alt=') {
            my $pos = index($lc_text, $open_tag, $search_from);
            next if $pos == -1;
            $pos_start = $pos if ($pos_start == -1 or $pos < $pos_start);
        }
        last if $pos_start == -1;    # no further math element in this page

        my $pos_end = index($lc_text, '</math>', $pos_start + length('<math'));
        if ($pos_end == -1) {
            # Opening tag without closing tag: report it and give up.
            error_013_Math_no_correct_end( substr($text, $pos_start, 50) );
            last;
        }

        # Include the closing tag itself in the span that gets blanked.
        $pos_end = $pos_end + length('</math>');
        my $span_length = $pos_end - $pos_start;

        # Replace the whole element with spaces of identical length.
        substr($text, $pos_start, $span_length, ' ' x $span_length);

        $search_from = $pos_end;
    }
    return;
}
sub get_source {
    # Replace every <source ...>...</source> span in the global $text with
    # the same number of spaces (offsets in $text stay unchanged).
    # An opening tag without '</source>' is reported through
    # error_014_Source_no_correct_end() (first 50 characters) and ends the
    # search.
    my $search_from = 0;    # offset behind the last blanked span
    my $found_again;
    do {
        $found_again = 0;

        # get position of next <source ...> and of its closing tag
        my $pos_start = index($text, '<source', $search_from);
        my $pos_end   = index($text, '</source>', $pos_start + length('<source'));

        if ($pos_start > -1 and $pos_end > -1) {
            # found a source block in current page
            $pos_end = $pos_end + length('</source>');

            # replace source block with spaces
            my $span = $pos_end - $pos_start;
            substr($text, $pos_start, $span, ' ' x $span);

            $search_from = $pos_end;
            $found_again = 1;
        }
        elsif ($pos_start > -1) {
            error_014_Source_no_correct_end(substr($text, $pos_start, 50));
        }
    } while ($found_again);
}

sub get_code {
    # Replace every <code>...</code> span in the global $text with the same
    # number of spaces. An opening tag without '</code>' is reported through
    # error_015_Code_no_correct_end() (first 50 characters) and ends the
    # search.
    my $search_from = 0;    # offset behind the last blanked span
    my $found_again;
    do {
        $found_again = 0;

        # get position of next <code> and of its closing tag
        my $pos_start = index($text, '<code>', $search_from);
        my $pos_end   = index($text, '</code>', $pos_start);

        if ($pos_start > -1 and $pos_end > -1) {
            # found a code block in current page
            $pos_end = $pos_end + length('</code>');

            # replace code block with spaces
            my $span = $pos_end - $pos_start;
            substr($text, $pos_start, $span, ' ' x $span);

            $search_from = $pos_end;
            $found_again = 1;
        }
        elsif ($pos_start > -1) {
            error_015_Code_no_correct_end(substr($text, $pos_start, 50));
        }
    } while ($found_again);
}

##################################################################

sub get_isbn {
    # Collect all ISBN candidates of the global $text.
    # Every occurrence of "ISBN" followed by a blank, "-" or "=" is
    # examined; the run of digits, X/x, blanks and dashes behind it is
    # trimmed, pushed onto the global @isbn and handed to check_isbn().
    # Pages whose title is one of the ISBN articles listed below are skipped.

    # better with show too interwiki !!!
    my @skip_titles = (
        'International Standard Book Number',
        'ISBN',
        'ISBN-10',
        'ISBN-13',
        'Internationaal Standaard Boeknummer',
    );
    foreach my $skip_title (@skip_titles) {
        return if ($title eq $skip_title);
    }

    # "> -1" (not "> 0"): a page that starts with "ISBN" must be scanned too.
    return if (index($text, 'ISBN') == -1);

    my $text_test = $text;
    while ($text_test =~ /ISBN([ ]|[-]|[=])/g) {
        # the match is "ISBN" plus one character, so it started 5 back
        my $pos_start    = pos($text_test) - 5;
        my $current_isbn = substr($text_test, $pos_start);

        # \tab (only the first one is replaced)
        $current_isbn =~ s/\t/ /;

        if ($current_isbn =~ /^([ ]+)?ISBN=([ ]+)?/) {
            # ISBN = 01234566 in templates
            $current_isbn =~ s/^([ ]+)?ISBN([ ]+)?=([ ]+)?/ /;

            my $pos_open  = index($current_isbn, '[');
            my $pos_close = index($current_isbn, ']');
            if (   ($pos_open == -1 and $pos_close > -1)
                or ($pos_open > -1 and $pos_close > -1 and $pos_open > $pos_close))
            {
                # "ISBN=" is part of an external link, for example
                # [http://www.dehoniane.it/edb/cat_dettaglio.php?ISBN=24109]
                $current_isbn = 'ISBN';
            }
        }

        if ($current_isbn =~ /^([ ]+)?ISBN-[^1]/) {
            # text "ISBN-number"
            # text "ISBN-bureau"
            $current_isbn = 'ISBN';
        }

        my $pos_next_ISBN = index($current_isbn, 'ISBN', 4);
        if ($pos_next_ISBN > -1) {
            # many ISBN behind the first ISBN
            # "ISBN 1-883319-85-4 ISBN 0-7567-5698-7 ISBN 0-8264-1258-0"
            $current_isbn = substr($current_isbn, 0, $pos_next_ISBN);
        }
        $current_isbn =~ s/ISBN//g;

        # take the leading run of characters that can belong to an ISBN
        my ($result_isbn) = $current_isbn =~ /^([ 0-9Xx\-]*)/;

        if ($result_isbn =~ /[^ ]/ and $result_isbn =~ /[0-9]/) {
            $result_isbn =~ s/^[ ]+//;
            $result_isbn =~ s/[ ]+$//;
            push(@isbn, $result_isbn);
            check_isbn($result_isbn);
        }
    }
}

sub check_isbn {
    # Validate one ISBN candidate.
    #
    # Parameter: $_[0] - the ISBN text as found in the page.
    #
    # Reports at most one problem through the error_0xx subs:
    #   069 - text starts with "-10" or "-13" (written as "ISBN-10 ...")
    #   071 - an X that is not at position 10
    #   072 - 10 characters, wrong checksum
    #   073 - 13 characters, wrong checksum
    #   070 - any other non-zero length
    my $current_isbn = $_[0];

    # work on a copy without blanks
    my $test_isbn = $current_isbn;
    $test_isbn =~ s/[ ]//g;

    # syntax "ISBN-10 ..." / "ISBN-13 ..."
    if (index($test_isbn, '-10') == 0 or index($test_isbn, '-13') == 0) {
        error_069_isbn_wrong_syntax($current_isbn);
        return;
    }

    $test_isbn =~ s/-//g;

    # wrong position of X
    $test_isbn =~ s/x/X/g;
    my $pos_x = index($test_isbn, 'X');
    if ($pos_x > -1) {
        if ($pos_x != 9) {
            # ISBN 123456X890
            error_071_isbn_wrong_pos_X($current_isbn);
            return;
        }
        # ISBN 123451678XXXX b --> keep only the first 10 characters
        $test_isbn = substr($test_isbn, 0, 10);
    }

    # Check checksum 13: weights 1,3,1,3,... over the first 12 digits
    my $check_13_ok   = 0;
    my $found_text_13 = '';
    if (length($test_isbn) >= 13 and $test_isbn =~ /^[0-9]{13}/) {
        my $checksum = 0;
        foreach my $i (0 .. 11) {
            my $weight = ($i % 2 == 0) ? 1 : 3;
            $checksum = $checksum + $weight * substr($test_isbn, $i, 1);
        }
        my $checker = (10 - $checksum % 10) % 10;
        if ($checker eq substr($test_isbn, 12, 1)) {
            $check_13_ok = 1;
        }
        else {
            $found_text_13 = $current_isbn . '</nowiki> || <nowiki>'
              . substr($test_isbn, 12, 1) . ' vs. ' . $checker;
        }
    }

    # Check checksum 10: weights 1..9 over the first 9 digits, mod 11
    my $check_10_ok   = 0;
    my $found_text_10 = '';
    if (    not $check_13_ok
        and length($test_isbn) >= 10
        and $test_isbn =~ /^[0-9X]{10}/)
    {
        my $checksum = 0;
        foreach my $i (0 .. 8) {
            $checksum = $checksum + ($i + 1) * substr($test_isbn, $i, 1);
        }
        my $checker        = $checksum % 11;
        my $expected_digit = ($checker == 10) ? 'X' : $checker;
        my $found_digit    = substr($test_isbn, 9, 1);
        if ($expected_digit ne $found_digit) {
            # check wrong and 10 or more characters
            $found_text_10 = $current_isbn . '</nowiki> || <nowiki>'
              . $found_digit . ' vs. ' . $checker . ' (' . $checksum . ' mod 11)';
        }
        else {
            $check_10_ok = 1;
        }
    }

    return if ($check_10_ok or $check_13_ok);

    # no checksum matched: report by length of isbn
    my $isbn_length = length($test_isbn);
    if ($isbn_length == 10) {
        error_072_isbn_10_wrong_checksum($found_text_10);
    }
    elsif ($isbn_length == 13) {
        error_073_isbn_13_wrong_checksum($found_text_13);
    }
    elsif ($isbn_length != 0) {
        error_070_isbn_wrong_length($current_isbn . '</nowiki> || <nowiki>' . $isbn_length);
    }
    return;
}

##################################################################

sub get_templates {
    # Filter all templates of the global $text.
    #
    # Step 1: every "{{" is extended to the first "}}" at which the number
    #         of "{{" and "}}" is equal; the result is stored in the global
    #         @templates_all. A template that never balances is reported
    #         through error_043_template_no_correct_end().
    # Step 2: every template with at least one pipe is split into its parts;
    #         for each part a row is stored in the global @template:
    #           [0] number of template   [1] template name
    #           [2] number of part       [3] attribute (or running number)
    #           [4] value
    # Step 3: in dump mode, for namespace 0 and 6, the rows are appended as
    #         tab separated lines to the file $templatetiger_filename.
    my $text_test = $text;
    $text_test =~ s/\n//g;    # delete all breaks --> only one line
    $text_test =~ s/\t//g;    # delete all tabulator --> better for output

    @templates_all = ();

    while ($text_test =~ /\{\{/g) {
        # Begin of template
        my $pos_start = pos($text_test) - 2;
        my $temp_text = substr($text_test, $pos_start);

        # Find correct end - number of {{ == }}
        my $complete_template;
        while ($temp_text =~ /\}\}/g) {
            my $candidate   = substr($temp_text, 0, pos($temp_text));
            my $open_count  = () = $candidate =~ /\{\{/g;
            my $close_count = () = $candidate =~ /\}\}/g;
            if ($open_count == $close_count) {
                $complete_template = $candidate;
                last;
            }
        }

        if (defined $complete_template) {
            # template is correct
            push(@templates_all, $complete_template);
        }
        else {
            # template has no correct end
            $temp_text = text_reduce($temp_text, 80);
            error_043_template_no_correct_end($temp_text);
        }
    }

    # extract for each template all attributes and values
    my $number_of_templates   = -1;
    my $template_part_counter = -1;
    my $output                = '';

    foreach my $found_template (@templates_all) {
        my $current_template = $found_template;
        $current_template =~ s/^\{\{//;
        $current_template =~ s/\}\}$//;
        $current_template =~ s/^ //g;

        # remove a leading template namespace
        foreach (@namespace_templates) {
            $current_template =~ s/^$_://i;
        }

        $number_of_templates = $number_of_templates + 1;

        # if no pipe; for example {{test}} --> nothing to extract
        next if (index($current_template, '|') == -1);

        # templates with pipe {{test|attribute=value}}
        my @template_split = split(/\|/, $current_template);

        # get template name
        my $template_name = shift(@template_split);
        $template_name = '' unless (defined $template_name);
        $template_name =~ s/^ //g;

        # Glue the pieces together again until [[ ]] and {{ }} are balanced,
        # because a pipe inside a link or inner template is no separator.
        my $template_part = '';
        my @template_part_array;
        foreach my $piece (@template_split) {
            $template_part = $template_part . $piece;
            print "\t" . 'Test this: ' . $template_part . "\n" if ($details_for_page eq 'yes');

            # check for []
            my $beginn_brackets = () = $template_part =~ /\[\[/g;
            my $end_brackets    = () = $template_part =~ /\]\]/g;

            # check for {}
            my $beginn_curly_brackets = () = $template_part =~ /\{\{/g;
            my $end_curly_brackets    = () = $template_part =~ /\}\}/g;

            # template part complete ?
            if (    $beginn_brackets == $end_brackets
                and $beginn_curly_brackets == $end_curly_brackets)
            {
                push(@template_part_array, $template_part);
                $template_part = '';
            }
            else {
                $template_part = $template_part . '|';
            }
        }

        my $template_part_number           = -1;
        my $template_part_without_attribut = -1;
        foreach my $part (@template_part_array) {
            $template_part_number  = $template_part_number + 1;
            $template_part_counter = $template_part_counter + 1;

            $template_name =~ s/^[ ]+//g;
            $template_name =~ s/[ ]+$//g;

            my $attribut = '';
            my $value    = '';

            # An "=" only separates attribute and value when no "<", "{{",
            # "{|" or "[" stands in front of it, for example
            # problem: {{test|value<ref name="sdfsdf"> sdfhsdf</ref>}}
            # problem: {{test|value{{test2|name=teste}}|sdfsdf}}
            my $pos_equal = index($part, '=');
            my $equal_ok  = ($pos_equal > -1) ? 'true' : 'false';
            if ($equal_ok eq 'true') {
                foreach my $blocker ('<', '{{', '{|', '[') {
                    my $pos_blocker = index($part, $blocker);
                    $equal_ok = 'false' if ($pos_blocker > -1 and $pos_blocker < $pos_equal);
                }
            }

            if ($equal_ok eq 'true') {
                # template part with "=" {{test|attribut=value}}
                $attribut = substr($part, 0, $pos_equal);
                $value    = substr($part, $pos_equal + 1);
            }
            else {
                # template part with no usable "=" {{test|value}}
                $template_part_without_attribut = $template_part_without_attribut + 1;
                $attribut = $template_part_without_attribut;
                $value    = $part;
            }

            $attribut =~ s/^[ ]+//g;
            $attribut =~ s/[ ]+$//g;
            $value    =~ s/^[ ]+//g;
            $value    =~ s/[ ]+$//g;

            $template[$template_part_counter][0] = $number_of_templates;
            $template[$template_part_counter][1] = $template_name;
            $template[$template_part_counter][2] = $template_part_number;
            $template[$template_part_counter][3] = $attribut;
            $template[$template_part_counter][4] = $value;

            $number_of_template_parts = $number_of_template_parts + 1;

            $output .= $title . "\t";
            $output .= $page_id . "\t";
            $output .= $template[$template_part_counter][0] . "\t";
            $output .= $template[$template_part_counter][1] . "\t";
            $output .= $template[$template_part_counter][2] . "\t";
            $output .= $template[$template_part_counter][3] . "\t";
            $output .= $template[$template_part_counter][4] . "\n";
        }
    }

    if (    $output ne ''
        and $dump_or_live eq 'dump'
        and ($page_namespace == 0 or $page_namespace == 6))
    {
        $output =~ s/\n$//;

        # the file never ends with a line break, so a new block starts with one
        if (-e $templatetiger_filename) {
            $output = "\n" . $output;
        }
        print $output . "\n" if ($details_for_page eq 'yes');

        if (open(my $templatetiger_fh, '>>', $templatetiger_filename)) {
            print {$templatetiger_fh} $output;
            close($templatetiger_fh);
        }
        else {
            # keep scanning; the template export is only a by-product
            warn 'Cannot append to ' . $templatetiger_filename . ': ' . $! . "\n";
        }
        $output = '';
    }
}
##################################################################

sub get_links {
    # Filter all wiki links of the global $text.
    # Every "[[" is extended to the first "]]" at which the number of "[["
    # and "]]" is equal; the result is stored in the global @links_all.
    # A link that never balances is reported through
    # error_010_count_square_breaks().
    my $text_test = $text;
    $text_test =~ s/\n//g;

    undef(@links_all);

    while ($text_test =~ /\[\[/g) {
        # Begin of link
        my $pos_start = pos($text_test) - 2;
        my $link_text = substr($text_test, $pos_start);

        # Find correct end - number of [[ == ]]
        my $complete_link;
        while ($link_text =~ /\]\]/g) {
            my $candidate   = substr($link_text, 0, pos($link_text));
            my $open_count  = () = $candidate =~ /\[\[/g;
            my $close_count = () = $candidate =~ /\]\]/g;
            if ($open_count == $close_count) {
                $complete_link = $candidate;
                last;
            }
        }

        if (defined $complete_link) {
            # link is correct
            push(@links_all, $complete_link);
        }
        else {
            # link has no correct end
            $link_text = text_reduce($link_text, 80);
            error_010_count_square_breaks($link_text);
        }
    }
}

sub get_images {
    # Get all images from all links (global @links_all) and store them in
    # the global @images_all. The first image that has no description
    # ([[Image:Afriga3.svg]] or [[Image:Afriga3.svg|]], after removing
    # layout keywords and pixel sizes) is reported through
    # error_030_image_without_description().
    undef(@images_all);
    my $found_error_text = '';

    # layout keywords that are not a description, in the order they are removed
    my @magicword_lists = (
        \@magicword_img_thumbnail, \@magicword_img_right,
        \@magicword_img_left,      \@magicword_img_none,
        \@magicword_img_center,    \@magicword_img_framed,
        \@magicword_img_frameless, \@magicword_img_border,
        \@magicword_img_sub,       \@magicword_img_super,
        \@magicword_img_baseline,  \@magicword_img_top,
        \@magicword_img_text_top,  \@magicword_img_middle,
        \@magicword_img_bottom,
    );

    foreach (@links_all) {
        $current_link = $_;    # global, kept as in the original

        my $link_is_image = 'no';
        foreach my $namespace_image_word (@namespace_image) {
            $link_is_image = 'yes' if ($current_link =~ /^\[\[([ ]?)+?$namespace_image_word:/i);
        }
        next if ($link_is_image ne 'yes');

        # link is a image
        my $current_image = $current_link;
        push(@images_all, $current_image);

        my $test_image = $current_image;
        foreach my $magicword_list (@magicword_lists) {
            foreach my $current_magicword (@{$magicword_list}) {
                $test_image =~ s/\|([ ]?)+$current_magicword([ ]?)+(\||\])/$3/i;
            }
        }

        # special: 100px, 100x100px
        $test_image =~ s/\|([ ]?)+[0-9]+(x[0-9]+)?px([ ]?)+(\||\])/$4/i;

        # only the first problem of a page is reported
        next if ($found_error_text ne '');

        my $pos_1 = index($test_image, '|');
        if ($pos_1 == -1) {
            # [[Image:Afriga3.svg]]
            $found_error_text = $current_image;
        }
        else {
            my $pos_2 = index($test_image, '|', $pos_1 + 1);
            if ($pos_2 == -1 and index($test_image, '|]') > -1) {
                # [[Image:Afriga3.svg|]]
                $found_error_text = $current_image;
            }
        }
    }

    if ($found_error_text ne '') {
        error_030_image_without_description($found_error_text);
    }
}

##################################################################

sub get_tables {
    # Replace every wiki table "{| ... |}" in the global $text with the same
    # number of spaces. The search stops without an error when the found end
    # is "|}}" (end of a template), and with
    # error_028_table_no_correct_end() when there is no "|}" at all.
    my $search_from = 0;    # offset behind the last blanked table
    my $found_again;
    do {
        $found_again = 0;

        # get position of next table
        my $pos_start = index($text, '{|', $search_from);
        my $pos_end   = index($text, '|}', $pos_start);

        if (    $pos_start > -1
            and $pos_end > -1
            and substr($text, $pos_end, 3) ne '|}}')
        {
            # found a table in current page
            $pos_end = $pos_end + length('|}');

            # replace table with spaces
            my $span = $pos_end - $pos_start;
            substr($text, $pos_start, $span, ' ' x $span);

            $search_from = $pos_end;
            $found_again = 1;
        }
        elsif ($pos_start > -1 and $pos_end == -1) {
            error_028_table_no_correct_end(substr($text, $pos_start, 50));
        }
    } while ($found_again);
}

sub get_gallery {
    # Hand every <gallery ...>...</gallery> span of the global $text to
    # error_035_gallery_without_description() and then replace it with the
    # same number of spaces. An opening tag without '</gallery>' is reported
    # through error_029_gallery_no_correct_end() and ends the search.
    my $search_from = 0;    # offset behind the last blanked gallery
    my $found_again;
    do {
        $found_again = 0;

        my $pos_start = index($text, '<gallery', $search_from);
        my $pos_end   = index($text, '</gallery>', $pos_start);

        if ($pos_start > -1 and $pos_end > -1) {
            $pos_end = $pos_end + length('</gallery>');
            my $span = $pos_end - $pos_start;

            # check the gallery before it is removed
            my $text_gallery = substr($text, $pos_start, $span);
            error_035_gallery_without_description($text_gallery);

            # replace gallery with spaces
            substr($text, $pos_start, $span, ' ' x $span);

            $search_from = $pos_end;
            $found_again = 1;
        }
        elsif ($pos_start > -1) {
            error_029_gallery_no_correct_end(substr($text, $pos_start, 50));
        }
    } while ($found_again);
}

sub get_hiero {
    # Replace every <hiero>...</hiero> span in the global $text with the
    # same number of spaces. A missing '</hiero>' only ends the search; no
    # error is reported for it.
    my $search_from = 0;    # offset behind the last blanked span
    my $found_again;
    do {
        $found_again = 0;

        # get position of next <hiero>
        my $pos_start = index($text, '<hiero>', $search_from);
        my $pos_end   = index($text, '</hiero>', $pos_start);

        if ($pos_start > -1 and $pos_end > -1) {
            $pos_end = $pos_end + length('</hiero>');

            # replace hiero block with spaces
            my $span = $pos_end - $pos_start;
            substr($text, $pos_start, $span, ' ' x $span);

            $search_from = $pos_end;
            $found_again = 1;
        }
    } while ($found_again);
}

sub get_ref {
    # Store every "<ref>...</ref>" of the global $text (including the tags)
    # in the global @ref. $text itself is not changed.
    undef(@ref);

    my $search_from = 0;    # offset behind the last stored reference
    my $found_again;
    do {
        $found_again = 0;

        # get position of next <ref>
        my $pos_start = index($text, '<ref>', $search_from);
        my $pos_end   = index($text, '</ref>', $pos_start);

        if ($pos_start > -1 and $pos_end > -1) {
            $pos_end = $pos_end + length('</ref>');
            push(@ref, substr($text, $pos_start, $pos_end - $pos_start));

            $search_from = $pos_end;
            $found_again = 1;
        }
    } while ($found_again);
}

sub check_for_redirect {
    # is this page a redirect?
    if (index(lc($text), '#redirect') > -1) {
        $page_is_redirect = 'yes';
    }
}

sub get_categories {
    # Search for categories in the global $text. For every
    # "[[<namespace>:...]]" (namespace words from the global
    # @namespace_cat, case-insensitive) one row is appended to the global
    # @category:
    #   [0] start offset   [1] end offset (behind "]]")
    #   [2] category name  [3] sort key behind the pipe ('' if none)
    #   [4] the complete link text
    foreach my $namespace_cat_word (@namespace_cat) {
        my $text_test   = $text;
        my $search_word = $namespace_cat_word;

        while ($text_test =~ /\[\[([ ]+)?($search_word:)/ig) {
            # offset of the first "[" of this match
            my $pos_start = $-[0];
            my $pos_end   = index($text_test, ']]', $pos_start);
            next if ($pos_end == -1);

            # found a category in current page
            $pos_end = $pos_end + length(']]');
            my $full_category = substr($text_test, $pos_start, $pos_end - $pos_start);

            # filter catname
            my $category_name = $full_category;
            $category_name =~ s/\[\[//g;                     # delete [[
            $category_name =~ s/^([ ]+)?//g;                 # delete blank before text
            $category_name =~ s/\]\]//g;                     # delete ]]
            $category_name =~ s/^$namespace_cat_word//i;     # delete namespace word
            $category_name =~ s/^://;                        # delete :
            $category_name =~ s/\|(.)*//g;                   # delete |xy
            $category_name =~ s/^ //g;                       # delete blank before text
            $category_name =~ s/ $//g;                       # delete blank after text

            # filter linkname
            my $category_linkname = $full_category;
            $category_linkname = '' if (index($category_linkname, '|') == -1);
            $category_linkname =~ s/^(.)*\|//gi;             # delete [[category:xy|
            $category_linkname =~ s/\]\]//g;                 # delete ]]
            $category_linkname =~ s/^ //g;                   # delete blank before text
            $category_linkname =~ s/ $//g;                   # delete blank after text

            $category_counter = $category_counter + 1;
            $category[$category_counter][0] = $pos_start;
            $category[$category_counter][1] = $pos_end;
            $category[$category_counter][2] = $category_name;
            $category[$category_counter][3] = $category_linkname;
            $category[$category_counter][4] = $full_category;
        }
    }
}

sub get_interwikis {
    # Search for interwiki links in the global $text. For every
    # "[[<lang>:...]]" (language codes from the global @inter_list,
    # case-insensitive) one row is appended to the global @interwiki:
    #   [0] start offset    [1] end offset (behind "]]")
    #   [2] interwiki name  [3] text behind the pipe ('' if none)
    #   [4] the complete link text   [5] language code from @inter_list
    foreach my $current_lang (@inter_list) {
        my $text_test   = $text;
        my $search_word = $current_lang;

        while ($text_test =~ /\[\[([ ]+)?($search_word:)/ig) {
            # offset of the first "[" of this match
            my $pos_start = $-[0];
            my $pos_end   = index($text_test, ']]', $pos_start);
            next if ($pos_end == -1);

            # found an interwiki link in current page
            $pos_end = $pos_end + length(']]');
            my $full_interwiki = substr($text_test, $pos_start, $pos_end - $pos_start);

            # filter interwiki name
            my $interwiki_name = $full_interwiki;
            $interwiki_name =~ s/\]\]//g;         # delete ]]
            $interwiki_name =~ s/\|(.)*//g;       # delete |xy
            $interwiki_name =~ s/^(.)*://gi;      # delete [[xx:
            $interwiki_name =~ s/^ //g;           # delete blank before text
            $interwiki_name =~ s/ $//g;           # delete blank after text

            # filter linkname
            my $interwiki_linkname = $full_interwiki;
            $interwiki_linkname = '' if (index($interwiki_linkname, '|') == -1);
            $interwiki_linkname =~ s/^(.)*\|//gi; # delete [[xx:xy|
            $interwiki_linkname =~ s/\]\]//g;     # delete ]]
            $interwiki_linkname =~ s/^ //g;       # delete blank before text
            $interwiki_linkname =~ s/ $//g;       # delete blank after text

            $interwiki_counter = $interwiki_counter + 1;
            $interwiki[$interwiki_counter][0] = $pos_start;
            $interwiki[$interwiki_counter][1] = $pos_end;
            $interwiki[$interwiki_counter][2] = $interwiki_name;
            $interwiki[$interwiki_counter][3] = $interwiki_linkname;
            $interwiki[$interwiki_counter][4] = $full_interwiki;

            # language
            $interwiki[$interwiki_counter][5] = $current_lang;
        }
    }
}

sub create_line_array {
    # Split the global $text at line breaks into the global @lines.
    @lines = split(/\n/, $text);
}

sub get_line_first_blank {
    # Store in the global @lines_first_blank every line of @lines that
    # starts with exactly one blank followed by a character that is
    # neither a blank nor "|" nor "!" (the last two belong to tables).
    undef(@lines_first_blank);
    foreach my $current_line (@lines) {
        if ($current_line =~ /^ [^ \|\!]/) {
            push(@lines_first_blank, $current_line);
        }
    }
}

sub get_headlines {
    # Store every line of @lines that starts with "=" in the global
    # @headlines, and append the text of each section (headline line plus
    # the lines up to the next headline, joined without line breaks) to the
    # global @section. Empty sections are not stored.
    undef(@headlines);
    my $section_text = '';

    # get headlines
    foreach my $current_line (@lines) {
        if (substr($current_line, 0, 1) eq '=') {
            # a new section begins: close the one collected so far
            push(@section, $section_text) if ($section_text ne '');
            $section_text = '';
            push(@headlines, $current_line);
        }
        $section_text = $section_text . $current_line;
    }
    push(@section, $section_text) if ($section_text ne '');
}

############################################################################

sub error_check {
    # Run all error checks for the current page.
    # In mode 'dump' and 'live' every check is called; in mode 'only' just
    # check 030. Checks that are called with '' take a text argument when
    # they are called from the get_... subs.
    print 'Start check error' . "\n" if ($details_for_page eq 'yes');

    if ($dump_or_live eq 'dump' or $dump_or_live eq 'live') {
        error_001_no_bold_title();    # doesn't work - deactivated
        error_002_have_br();
        error_003_have_ref();
        error_004_have_html_and_no_topic();
        error_005_Comment_no_correct_end('');
        error_006_defaultsort_with_special_letters();
        error_007_headline_only_three();
        error_008_headline_start_end();
        error_009_more_then_one_category_in_a_line();
        error_010_count_square_breaks('');
        error_011_html_names_entities();
        error_012_html_list_elements();
        error_013_Math_no_correct_end('');
        error_014_Source_no_correct_end('');
        error_015_Code_no_correct_end('');
        error_016_unicode_control_characters();
        error_017_category_double();
        error_018_category_first_letter_small();
        error_019_headline_only_one();
        error_020_symbol_for_dead();
        error_021_category_is_english();
        error_022_category_with_space();
        error_023_nowiki_no_correct_end('');
        error_024_pre_no_correct_end('');
        error_025_headline_hierarchy();
        error_026_html_text_style_elements();
        error_027_unicode_syntax();
        error_028_table_no_correct_end('');
        error_029_gallery_no_correct_end('');
        error_030_image_without_description('');
        error_031_html_table_elements();
        error_032_double_pipe_in_link();
        error_033_html_text_style_elements_underline();
        error_034_template_programming_elements();
        error_035_gallery_without_description('');
        error_036_redirect_not_correct();
        error_037_title_with_special_letters_and_no_defaultsort();
        error_038_html_text_style_elements_italic();
        error_039_html_text_style_elements_paragraph();
        error_040_html_text_style_elements_font();
        error_041_html_text_style_elements_big();
        error_042_html_text_style_elements_small();
        error_043_template_no_correct_end('');
        error_044_headline_with_bold();
        error_045_interwiki_double();
        error_046_count_square_breaks_begin();
        error_047_template_no_correct_begin();
        error_048_title_in_text();
        error_049_headline_with_html();
        error_050_dash();
        error_051_interwiki_before_last_headline();
        error_052_category_before_last_headline();
        error_053_interwiki_before_category();
        error_054_break_in_list();
        error_055_html_text_style_elements_small_double();
        error_056_arrow_as_ASCII_art();
        error_057_headline_end_with_colon();
        error_058_headline_with_capitalization();
        error_059_template_value_end_with_br();
        error_060_template_parameter_with_problem();
        error_061_reference_with_punctuation();
        error_062_headline_alone();
        error_063_html_text_style_elements_small_ref_sub_sup();
        error_064_link_equal_linktext();
        error_065_image_description_with_break();
        error_066_image_description_with_full_small();
        error_067_reference_after_punctuation();
        error_068_link_to_other_language();
        error_069_isbn_wrong_syntax('');
        error_070_isbn_wrong_length('');
        error_071_isbn_wrong_pos_X('');
        error_072_isbn_10_wrong_checksum('');
        error_073_isbn_13_wrong_checksum('');
        error_074_link_with_no_target();
        error_075_indented_list();
        error_076_link_with_no_space();
        error_077_image_description_with_partial_small();
        error_078_reference_double();
        error_079_external_link_without_description();
        error_080_external_link_with_line_break();
        error_081_ref_double();
        error_082_link_to_other_wikiproject();
        error_083_headline_only_three_and_later_level_two();
    }

    if ($dump_or_live eq 'only') {
        error_030_image_without_description('');
    }

    #############
    # next feature
    ## comment_very_long;
}

###################################

sub error_001_no_bold_title {
    my $error_code = 1;
    $error_description[$error_code][0] = -1;
    $error_description[$error_code][1] = 'No bold title';
    $error_description[$error_code][2] = 'This article has no bold title like <nowiki>'."'''Title'''".'</nowiki>.';
    print $error_code."\n" if ($details_for_page eq 'yes');
    if ($page_namespace == 0 and
index( $text, "'''" )== -1 and $page_is_redirect eq 'no') { error_register($error_code, ''); #print "\t". $error_code."\t".$title."\n"; } } sub error_002_have_br{ my $error_code = 2; $error_description[$error_code][0] = 1; $error_description[$error_code][1] = 'Article with false <nowiki><br/></nowiki>'; $error_description[$error_code][2] = 'This article contains a <nowiki><br\></nowiki> or <nowiki><\br></nowiki> or <nowiki><br.></nowiki> but a <nowiki><br></br> or <br/></nowiki> tag is necessary in order to be correct XHTML-syntax (see [http://www.w3.org/TR/xhtml1/#h-4.6 1], [http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags 2]).'; print $error_code."\n" if ($details_for_page eq 'yes'); my $test = 'no found'; my $test_line = ''; if ( $page_namespace == 0 ) { my $test_text = lc($text); if (index($test_text, '<br') > -1 or index($test_text, 'br>') > -1) { my $pos = -1; foreach (@lines) { my $current_line = $_; my $current_line_lc = lc($current_line); #print $current_line_lc."\n"; if ($current_line_lc =~ /<br\/[^ ]>/g ){ # <br/1> $pos = pos($current_line_lc) if ( $pos == -1); } if ($current_line_lc =~ /<br[^ ]\/>/g ){ # <br1/> $pos = pos($current_line_lc) if ( $pos == -1); } if ($current_line_lc =~ /<br[^ \/]>/g ) { # <br7> $pos = pos($current_line_lc) if ( $pos == -1); } if ($current_line_lc =~ /<[^ \/]br>/g ) { # <\br> $pos = pos($current_line_lc) if ($pos == -1); } if ($pos > -1 and $test ne 'found'){ #print $pos."\n"; $test = 'found'; if ($test_line eq '') { $test_line = substr($current_line, 0, $pos) ; $test_line = text_reduce_to_end( $test_line, 50); #print $test_line."\n"; } } } } } if ($test eq 'found' ) { $test_line = text_reduce($test_line, 80); error_register($error_code, '<nowiki>'.$test_line.' </nowiki>'); #print "\t". 
$error_code."\t".$title."\t".$test_line."\n"; } } sub error_003_have_ref{ my $error_code = 3; $error_description[$error_code][0] = 2; $error_description[$error_code][1] = 'Article with <nowiki><ref></nowiki> and no <nowiki><references /></nowiki>'; $error_description[$error_code][2] = 'This article has a <nowiki><ref></nowiki> and not a <nowiki><references /></nowiki>. This is not correct syntax.'; print $error_code."\n" if ($details_for_page eq 'yes'); if ($page_namespace == 0) { if ( index($text, '<ref>') > -1 or index($text, '<ref name') > -1 ) { my $test = "false"; my $test_text = lc($text); $test = "true" if ( $test_text =~ /<[ ]?+references[ ]?+\/>/); $test = "true" if ( $test_text =~ /<[ ]?+references group/); $test = "true" if ( $test_text =~ /\{\{[ ]?+refbegin/); $test = "true" if ( $test_text =~ /\{\{[ ]?+refend/); $test = "true" if ( $test_text =~ /\{\{[ ]?+reflist/); # in enwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+reflink/); # in enwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+reference list/); # in enwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+references-small/); # in enwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+references/); # in enwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+listaref /); # in enwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+reference/); # in enwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+przypisy/); # in plwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+amaga/); # in cawiki $test = "true" if ( $test_text =~ /\{\{[ ]?+referències/); # in cawiki $test = "true" if ( $test_text =~ /\{\{[ ]?+viitteet/); # in fiwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+verwysings/); # in afwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+references/); # in itwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+références/); # in frwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+notes/); # in frwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+listaref/); # in nlwiki $test = "true" if ( $test_text =~ /\{\{[ 
]?+referenties/); # in cawiki $test = "true" if ( $test_text =~ /\{\{[ ]?+ref-section/); # in ptwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+refs/); # in nlwiki + enwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+noot/); # in nlwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+unreferenced/); # in nlwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+fnb/); # in nlwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+примечания/); # in ruwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+список примечаний/); # in ruwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+Примечания/); # in ruwiki (Problem with big letters) $test = "true" if ( $test_text =~ /\{\{[ ]?+Список примечаний/); # in ruwiki (Problem with big letters) $test = "true" if ( $test_text =~ /\{\{[ ]?+kaynakça/ ); # in trwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+ثبت المراجع/ ); # in arwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+appendix/ ); # in nlwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+примітки/ ); # in ukwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+Примітки/ ); # in ukwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+hide ref/ ); # in zhwiki $test = "true" if ( $test_text =~ /\{\{[ ]?+forrás/ ); # in huwiki if ($test eq "false") { error_register($error_code, ''); #print "\t". $error_code."\t".$title."\n"; #die; } } } } sub error_004_have_html_and_no_topic{ my $error_code = 4; $error_description[$error_code][0] = 2; $error_description[$error_code][1] = 'Article with weblink'; $error_description[$error_code][2] = 'This article has a weblink and not a headline (like "<nowiki>== Weblinks ==</nowiki>"). All weblinks should be in the linklist or list of references.'; print $error_code."\n" if ($details_for_page eq 'yes'); if ( $page_namespace == 0 and index($text, 'http://') > -1 and index($text, '==') == -1 and index($text, '{{') == -1 and $project eq 'dewiki' and index($text, '<references') == -1 and index($text, '<ref>') == -1 ) { error_register($error_code, ''); #print "\t". 
$error_code."\t".$title."\n"; #die; } } sub error_005_Comment_no_correct_end{ my $error_code = 5; $comment = $_[0]; $error_description[$error_code][0] = 1; $error_description[$error_code][1] = 'Comment not correct end'; $error_description[$error_code][2] = 'Found a comment <nowiki>"<!--"</nowiki> with no <nowiki>"-->"</nowiki> end.'; print $error_code."\n" if ($details_for_page eq 'yes'); if ($comment ne '' and ( $page_namespace == 0 or $page_namespace == 6 ) ) { error_register($error_code, '<nowiki>'.$comment.'</nowiki>'); #print "\t". $error_code."\t".$title."\n"; } } sub error_006_defaultsort_with_special_letters{ my $error_code = 6; $error_description[$error_code][0] = 2; $error_description[$error_code][1] = 'DEFAULTSORT with special letters'; $error_description[$error_code][2] = 'Please don´t use special letters in the DEFAULTSORT (in ca: also in ORDENA).'."\n". '* in de: ä → a, ö → o, ü → u, ß → ss '."\n". '* in fi: ü → y, é → e, ß → ss, etc.'."\n". '* in sv and fi is allowed ÅÄÖåäö'."\n". '* in cs is allowed čďěňřšťžČĎŇŘŠŤŽ'."\n". '* in da, no, nn is allowed ÆØÅæøå'."\n". '* in ro is allowed ăîâşţ'."\n". '* in ru: Ё → Е, ё → е'."\n". 
"\n"; print $error_code."\n" if ($details_for_page eq 'yes'); # {{DEFAULTSORT:Mueller, Kai}} # {{ORDENA:Alfons I}} if ( $page_namespace == 0 and $project ne 'arwiki' and $project ne 'hewiki' and $project ne 'plwiki' and $project ne 'jawiki' and $project ne 'yiwiki' and $project ne 'zhwiki' ) { my $pos1 = -1; foreach (@magicword_defaultsort) { $pos1 = index($text, $_) if ($pos1 == -1); } if ($pos1 > -1 ) { $pos2 = index(substr($text,$pos1), '}}'); $testtext = substr($text, $pos1, $pos2); my $testtext_2 = $testtext; #my $testtext =~ s/{{DEFAULTSORT\s*:(.*)}}/$1/; #print $testtext."\n"; $testtext =~ s/[-—–:,\.0-9 A-Za-z!\?']//g; $testtext =~ s/[&]//g; $testtext =~ s/#//g; $testtext =~ s/\///g; $testtext =~ s/\(//g; $testtext =~ s/\)//g; $testtext =~ s/\*//g; $testtext =~ s/[ÅÄÖåäö]//g if ($project eq 'svwiki'); # For Swedish, ÅÄÖ should also be allowed $testtext =~ s/[ÅÄÖåäö]//g if ($project eq 'fiwiki'); # For Finnish, ÅÄÖ should also be allowed $testtext =~ s/[čďěňřšťžČĎŇŘŠŤŽ]//g if ($project eq 'cswiki'); $testtext =~ s/[ÆØÅæøå]//g if ($project eq 'dawiki'); $testtext =~ s/[ÆØÅæøå]//g if ($project eq 'nowiki'); $testtext =~ s/[ÆØÅæøå]//g if ($project eq 'nnwiki'); $testtext =~ s/[ăîâşţ]//g if ($project eq 'rowiki'); $testtext =~ s/[АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЫЪЭЮЯабвгдежзийклмнопрстуфхцчшщьыъэюя]//g if ($project eq 'ruwiki'); $testtext =~ s/[АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЫЪЭЮЯабвгдежзийклмнопрстуфхцчшщьыъэюяіїґ]//g if ($project eq 'ukwiki'); $testtext =~ s/[~]//g if ($project eq 'huwiki'); # ~ for special letters #if ($testtext ne '') error_register(…); #print $testtext."\n"; if ( ( $testtext ne '' ) # normal article #or ($testtext ne '' and $page_namespace != 0 and index($text, '{{DEFAULTSORT') > -1 ) # if not an article then wiht {{ }} ){ $testtext = text_reduce($testtext, 80); $testtext_2 = text_reduce($testtext_2, 80); error_register($error_code, '<nowiki>'.$testtext.'</nowiki> || <nowiki>'.$testtext_2.'</nowiki>'); #print "\t". 
$error_code."\t".$title."\t".$testtext."\n"; #die; } } #die; } } sub error_006_defaultsort_with_german_letters_old{ my $error_code = 6; $error_description[$error_code][0] = 2; $error_description[$error_code][1] = 'DEFAULTSORT with german letters'; $error_description[$error_code][2] = 'Please don´t use german letters in the DEFAULTSORT. Use a for ä, o for ö, u for ü and ss for ß.'; print $error_code."\n" if ($details_for_page eq 'yes'); # {{DEFAULTSORT:Mueller, Kai}} if ( index($text, 'DEFAULTSORT') > -1 ) { my $pos1 = index($text, 'DEFAULTSORT'); my $pos2 = index(substr($text,$pos1), '}}'); my $testtext = substr($text, $pos1, $pos2); #print $testtext."\n"; if ( index ($testtext, 'ä') >-1 or index ($testtext, 'ü') >-1 or index ($testtext, 'ö') >-1 or index ($testtext, 'ß') >-1 or index ($testtext, 'Ä') >-1 or index ($testtext, 'Ü') >-1 or index ($testtext, 'Ö') >-1 ){ error_register($error_code, '<nowiki>'.$testtext.'</nowiki>'); #print "\t". $error_code."\t".$title."\t".$testtext."\n"; #die; } } } sub error_007_headline_only_three{ my $error_code = 7; $error_description[$error_code][0] = 1; $error_description[$error_code][1] = 'Headlines start with three "="'; $error_description[$error_code][2] = 'The first headline start with <nowiki>"=== XY ==="</nowiki>. It should only be <nowiki>"== XY =="</nowiki>. See also error 083!'; print $error_code."\n" if ($details_for_page eq 'yes'); if ( $headlines[0] and $page_namespace == 0){ if ( $headlines[0] =~ /===/ ){ my $found_level_two = 'no'; foreach (@headlines) { if ($_ =~ /^==[^=]/) { $found_level_two = 'yes'; #found level two (error 83) } } if ($found_level_two eq 'no') { error_register($error_code, '<nowiki>'.$headlines[0].'</nowiki>'); #print "\t". 
$error_code."\t".$title."\t".'<nowiki>'.$headlines[0].'</nowiki>'."\n"; #die; } } } } sub error_008_headline_start_end{ my $error_code = 8; $error_description[$error_code][0] = 1; $error_description[$error_code][1] = 'Headline should end with "="'; $error_description[$error_code][2] = 'A headline should end with an "=".'; print $error_code."\n" if ($details_for_page eq 'yes'); foreach (@headlines) { my $current_line = $_; my $current_line1 = $current_line; my $current_line2 = $current_line; $current_line2 =~ s/\t//gi; $current_line2 =~ s/[ ]+/ /gi; $current_line2 =~ s/ $//gi; if ( $current_line1 =~ /^==/ and not ($current_line2 =~ /==$/) and index ($current_line ,'<ref') == -1 and $page_namespace == 0 ) { $current_line = text_reduce($current_line, 80); error_register($error_code, '<nowiki>'.$current_line.'</nowiki>'); #print "\t". $error_code."\t".$title."\t".'<nowiki>'.$current_line.'</nowiki>'."\n"; #if ($title eq '28 april'){ # my $test_length = length($current_line); # for (my $i =0; $i<= $test_length; $i++) { # my $test_text = substr($current_line, $i, 1); # print $test_text."\t".ord($test_text)."\n"; # } #} } } } sub error_009_more_then_one_category_in_a_line{ my $error_code = 9; $error_description[$error_code][0] = 3; $error_description[$error_code][1] = 'Categories more at one line'; $error_description[$error_code][2] = 'There is more then one category at one line. Please write only one at one line. It is better to read.'; my $error_line = ''; print $error_code."\n" if ($details_for_page eq 'yes'); foreach (@lines) { my $current_line = $_; my $found = 0; foreach (@namespace_cat) { my $namespace_cat_word = $_; $found = $found +1 if ( $current_line =~ /\[\[([ ]+)?($namespace_cat_word:)/ig); } if ($found > 1 and $page_namespace == 0 ) { $error_line = $current_line; } } if ($error_line ne '') { error_register($error_code, '<nowiki>'.$error_line.'</nowiki>'); #print "\t". 
$error_code."\t".$title."\t".'<nowiki>'.$error_line.'</nowiki>'."\n"; } } sub error_010_count_square_breaks{ my $error_code = 10; $error_description[$error_code][0] = 1; $error_description[$error_code][1] = 'Square brackets not correct end'; $error_description[$error_code][2] = 'Different number of <nowiki>[[</nowiki> and <nowiki>]]</nowiki> brackets. If it is sourcecode then use <nowiki><source> or <code></nowiki>.'; print $error_code."\n" if ($details_for_page eq 'yes'); my $comment = $_[0]; if ($comment ne '' and ($page_namespace == 0 or $page_namespace == 6) ) { $comment = text_reduce($comment, 80); error_register($error_code, '<nowiki>'.$comment.'</nowiki>'); #print "\t". $error_code."\t".$title."\n"; } } sub error_011_html_names_entities { my $error_code = 11; $error_description[$error_code][0] = 3; $error_description[$error_code][1] = 'HTML named entities'; $error_description[$error_code][2] = 'Find <tt>&a<code></code>uml;</tt> or <tt>&o<code></code>uml;</tt> or <tt>&u<code></code>uml;</tt>, <tt>&sz<code></code>lig;</tt> or other. 
Please use [[Unicode]] characters (äüöÄÜÖßåÅ…).'; print $error_code."\n" if ($details_for_page eq 'yes'); $error_description[$error_code][2] = infotext_change_error( $error_description[$error_code][2] ); $error_description[$error_code][2] = infotext_new_error( $error_description[$error_code][2] ); if ($page_namespace == 0 or $page_namespace == 6) { my $pos = -1; my $test_text = lc($text); # see http://turner.faculty.swau.edu/webstuff/htmlsymbols.html $pos = index( $test_text, 'ä') if ($pos == -1); $pos = index( $test_text, 'ö') if ($pos == -1); $pos = index( $test_text, 'ü') if ($pos == -1); $pos = index( $test_text, 'ß') if ($pos == -1); $pos = index( $test_text, 'å') if ($pos == -1); # åÅ $pos = index( $test_text, '…') if ($pos == -1); # … #$pos = index( $test_text, '<') if ($pos == -1); # for example, <em> produces <em> for use in examples #$pos = index( $test_text, '>') if ($pos == -1); #$pos = index( $test_text, '&') if ($pos == -1); # For example, in en:Beta (letter), the code: &beta; is used to add "&beta" to the page's display, rather than the unicode character β. 
$pos = index( $test_text, '"') if ($pos == -1); $pos = index( $test_text, '−') if ($pos == -1); $pos = index( $test_text, '‾') if ($pos == -1); $pos = index( $test_text, '¢') if ($pos == -1); $pos = index( $test_text, '£') if ($pos == -1); $pos = index( $test_text, '€') if ($pos == -1); $pos = index( $test_text, '§') if ($pos == -1); $pos = index( $test_text, '†') if ($pos == -1); $pos = index( $test_text, '‘') if ($pos == -1); $pos = index( $test_text, '’') if ($pos == -1); $pos = index( $test_text, '·') if ($pos == -1); $pos = index( $test_text, '•') if ($pos == -1); $pos = index( $test_text, '©') if ($pos == -1); $pos = index( $test_text, '®') if ($pos == -1); $pos = index( $test_text, '™') if ($pos == -1); $pos = index( $test_text, '¿') if ($pos == -1); $pos = index( $test_text, '¡') if ($pos == -1); $pos = index( $test_text, 'æ') if ($pos == -1); $pos = index( $test_text, 'ç') if ($pos == -1); $pos = index( $test_text, 'ñ') if ($pos == -1); $pos = index( $test_text, 'â') if ($pos == -1); $pos = index( $test_text, 'á') if ($pos == -1); $pos = index( $test_text, 'à') if ($pos == -1); #arrows $pos = index( $test_text, '↓') if ($pos == -1); $pos = index( $test_text, '↑') if ($pos == -1); $pos = index( $test_text, '↵') if ($pos == -1); $pos = index( $test_text, '→') if ($pos == -1); $pos = index( $test_text, '←') if ($pos == -1); $pos = index( $test_text, '↔') if ($pos == -1); if ($pos > -1) { my $found_text = substr ( $text , $pos); $found_text = text_reduce($found_text, 80); $found_text =~ s/&/&/g; error_register($error_code, '<nowiki>'.$found_text.'</nowiki>'); #print "\t". $error_code."\t".$title."\t".$found_text."\n"; } } } sub error_012_html_list_elements{ my $error_code = 12; $error_description[$error_code][0] = 3; $error_description[$error_code][1] = 'HTML List elements'; $error_description[$error_code][2] = 'Article contains a <nowiki>"<ol>", "<ul>" or "<li>"</nowiki>. 
'."In most cases we can use simpler wiki markups in place of these HTML-like tags."; print $error_code."\n" if ($details_for_page eq 'yes'); my $test = 'no found'; my $test_line = ''; my $test_text = lc($text); if (index($test_text, '<ol') > -1 or index($test_text, '<ul') > -1 or index($test_text, '<li>') > -1) { foreach (@lines) { my $current_line = $_; my $current_line_lc = lc($current_line); #get position of categorie if ( $page_namespace == 0 and index( $text, '<ol start') == -1 and index( $text, '<ol type') == -1 and index( $text, '<ol style="list-style-type:lower-roman">') == -1 and index( $text, '<ol style="list-style-type:lower-alpha">') == -1 and ( index( $current_line_lc, '<ol>') > -1 or index( $current_line_lc, '<ul>') > -1 or index( $current_line_lc, '<li>') > -1 )) { $test = 'found'; $test_line = $current_line if ($test_line eq ''); } } } if ($test eq 'found' ) { $test_line = text_reduce($test_line, 80); error_register($error_code, '<nowiki>'.$test_line.' </nowiki>'); #print "\t". $error_code."\t".$title."\t".$test_line."\n"; } } sub error_013_Math_no_correct_end{ my $error_code = 13; $comment = $_[0]; $error_description[$error_code][0] = 1; $error_description[$error_code][1] = 'Math not correct end'; $error_description[$error_code][2] = 'Found a <nowiki>"<math>"</nowiki> but no <nowiki>"} "</nowiki>.'; print $error_code."\n" if ($details_for_page eq 'yes');
if ($comment ne ) { error_register($error_code, ''.$comment.''); #print "\t". $error_code."\t".$title."\n"; } }
# Error 014: a <source ...> tag that is never closed.
#
# Parameter:
#   $_[0] - text excerpt to report. An empty string (this is how
#           error_check() calls the sub) or undef reports nothing.
#
# Side effects: fills $error_description[14] on every call, prints the error
# code when $details_for_page is 'yes', and calls error_register() when an
# excerpt was given.
sub error_014_Source_no_correct_end {
    my $error_code = 14;

    # Lexical copy; the previous code assigned to the package variable $comment.
    my $comment = defined $_[0] ? $_[0] : '';

    $error_description[$error_code][0] = 1;
    $error_description[$error_code][1] = 'Source not correct end';
    $error_description[$error_code][2] = 'Found a <nowiki>"<source …>"</nowiki> but no <nowiki>"</source>"</nowiki>.';

    print $error_code."\n" if ($details_for_page eq 'yes');

    if ($comment ne '') {
        # Wrapped in <nowiki> like error_005 / error_010 do; presumably so
        # that markup inside the excerpt is not interpreted in the report.
        error_register($error_code, '<nowiki>'.$comment.'</nowiki>');
        #print "\t". $error_code."\t".$title."\n";
    }
}
# Error 015: a <code> tag that is never closed.
#
# Parameter:
#   $_[0] - text excerpt to report. An empty string (this is how
#           error_check() calls the sub) or undef reports nothing.
#
# Side effects: fills $error_description[15] on every call, prints the error
# code when $details_for_page is 'yes', and calls error_register() when an
# excerpt was given.
sub error_015_Code_no_correct_end {
    my $error_code = 15;
    my $comment    = defined $_[0] ? $_[0] : '';

    $error_description[$error_code][0] = 1;
    $error_description[$error_code][1] = 'Code not correct end';
    $error_description[$error_code][2] = 'Found a <nowiki>"<code>"</nowiki> but no <nowiki>"</code>"</nowiki>.';

    print $error_code."\n" if ($details_for_page eq 'yes');

    if ($comment ne '') {
        # Wrapped in <nowiki> like error_005 / error_010 do.
        error_register($error_code, '<nowiki>'.$comment.'</nowiki>');
        #print "\t". $error_code."\t".$title."\n";
    }
}
# Error 016: a template contains an invisible Unicode control character
# (U+200B zero width space, U+FEFF byte order mark, U+200E left-to-right mark).
#
# Reads:  @templates_all, $page_namespace, $details_for_page.
# Writes: $error_description[16]; calls error_register() once per template
#         that contains one of the characters (namespaces 0 and 6 only).
#
# NOTE(review): the search strings had been reduced to '' (index() then
# returns 0 for every template). They are rebuilt here from the code points
# named in the description text - please confirm against the upstream script.
sub error_016_unicode_control_characters {
    my $error_code = 16;

    $error_description[$error_code][0] = 1;
    $error_description[$error_code][1] = 'Template with Unicode control characters';
    # Entities are written with &amp; so that the report shows the entity
    # text instead of the (invisible) character itself.
    $error_description[$error_code][2] = 'Find Unicode control characters &amp;#xFEFF; or &amp;#x200E; or &amp;#x200B; (en:Left-to-right_mark, en:Right-to-left mark, en:Byte-order mark). This could be a problem inside a template. Copy the template in a texteditor (for example Notepad++), where you see the controle characters and delete this. Copy then this text back in the article.';
    $error_description[$error_code][2] = infotext_change_error( $error_description[$error_code][2] );
    $error_description[$error_code][2] = infotext_new_error( $error_description[$error_code][2] );

    print $error_code."\n" if ($details_for_page eq 'yes');

    return unless ($page_namespace == 0 or $page_namespace == 6);

    # Each mark is listed twice: as UTF-8 octets (matches undecoded text) and
    # as a character (matches decoded text). Which of the two forms the page
    # text uses cannot be seen from this sub, so both are checked.
    my @control_marks = (
        "\xE2\x80\x8B", "\x{200B}",    # zero width space
        "\xEF\xBB\xBF", "\x{FEFF}",    # byte order mark
        "\xE2\x80\x8E", "\x{200E}",    # left-to-right mark
    );

    foreach my $template_text (@templates_all) {
        # Earliest occurrence of any mark in this template, -1 if none.
        my $pos = -1;
        foreach my $mark (@control_marks) {
            my $hit = index($template_text, $mark);
            next if ($hit == -1);
            $pos = $hit if ($pos == -1 or $hit < $pos);
        }
        next if ($pos == -1);

        my $found_text = substr($template_text, $pos);
        $found_text = text_reduce($found_text, 80);
        error_register($error_code, '<nowiki>'.$found_text.'</nowiki>');
        #print "\t". $error_code."\t".$title."\t".$found_text."\n";
    }
}
# Error 017: the same category is listed more than once.
#
# Compares every pair of entries in @category (column 2, first letter
# upper-cased) for indices 0 .. $category_counter and registers the error for
# each equal pair. Only pages in namespace 0 are reported.
#
# Reads:  @category, $category_counter, $page_namespace, $details_for_page.
# Writes: $error_description[17]; calls error_register().
sub error_017_category_double {
    my $error_code = 17;

    # The former "$comment = $_[0];" was dropped: the value was never used and
    # the assignment overwrote the package variable $comment.
    $error_description[$error_code][0] = 2;
    $error_description[$error_code][1] = 'Category double';
    $error_description[$error_code][2] = 'In this article is a category double.';

    print $error_code."\n" if ($details_for_page eq 'yes');

    return if ($page_namespace != 0);

    for (my $i = 0; $i <= $category_counter - 1; $i++) {
        my $first = $category[$i][2];
        $first = uc(substr($first, 0, 1)).substr($first, 1);       # first letter big

        for (my $j = $i + 1; $j <= $category_counter; $j++) {
            my $second = $category[$j][2];
            $second = uc(substr($second, 0, 1)).substr($second, 1); # first letter big

            if ($first eq $second) {
                error_register($error_code, '<nowiki>'.$category[$i][2].'</nowiki>');
                #print "\t". $error_code."\t".$title."\t".$category[$i][2]."\n";
            }
        }
    }
}
# Error 018: a category name starts with a lower-case letter.
#
# Checks the first character of column 2 of every @category entry against
# a-z, ä, ö, ü. Skipped completely for the project 'commonswiki'.
#
# Reads:  @category, $category_counter, $project, $details_for_page.
# Writes: $error_description[18]; calls error_register() per offending entry.
sub error_018_category_first_letter_small {
    my $error_code = 18;

    $error_description[$error_code][0] = 0;
    $error_description[$error_code][1] = 'Category first letter small';
    $error_description[$error_code][2] = 'The first letter of the category is small. It should be a big letter. If a user would scan a dump and he use the category then he will be very happy if all categories begin with a big letter.';

    print $error_code."\n" if ($details_for_page eq 'yes');

    return if ($project eq 'commonswiki');

    # Upper bound is $category_counter, as in error_017 / error_021 /
    # error_022; the former "$category_counter-1" never looked at the last
    # category.
    for (my $i = 0; $i <= $category_counter; $i++) {
        my $test_letter = substr($category[$i][2], 0, 1);
        if ($test_letter =~ /[a-z]|ä|ö|ü/) {
            error_register($error_code, '<nowiki>'.$category[$i][2].'</nowiki>');
            #print "\t".$test_letter.' - '.$category[$i][2]."\n";
        }
    }
}
# Error 019: the first headline of the page is a level-one headline ("= XY =").
#
# Reads:  @headlines, $page_namespace, $details_for_page.
# Writes: $error_description[19]; calls error_register() with the first
#         headline when it starts with exactly one "=" (namespace 0 only).
sub error_019_headline_only_one {
    my $error_code = 19;

    $error_description[$error_code][0] = 1;
    $error_description[$error_code][1] = 'Headlines start with one "="';
    $error_description[$error_code][2] = 'The first headline start with <nowiki>"= XY ="</nowiki>. It should only <nowiki>"== XY =="</nowiki>.';

    print $error_code."\n" if ($details_for_page eq 'yes');

    # Nothing to check without a (true) first headline or outside namespace 0.
    return unless ($headlines[0] and $page_namespace == 0);

    # One "=" followed by anything but another "=".
    if ($headlines[0] =~ /^=[^=]/) {
        error_register($error_code, '<nowiki>'.$headlines[0].'</nowiki>');
        #print "\t". $error_code."\t".$title."\t".'<nowiki>'.$headlines[0].'</nowiki>'."\n";
    }
}
# Error 020: the article uses the HTML entity "&dagger;" instead of the
# character "†".
#
# Reads:  $text, $page_namespace, $details_for_page.
# Writes: $error_description[20]; calls error_register() with up to 50
#         characters (via text_reduce) starting at the first occurrence.
sub error_020_symbol_for_dead {
    my $error_code = 20;

    $error_description[$error_code][0] = 3;
    $error_description[$error_code][1] = 'Symbol for dead';
    # &amp; keeps the entity readable in the report instead of rendering it as "†".
    $error_description[$error_code][2] = 'The article had a &amp;dagger; and not †.';

    print $error_code."\n" if ($details_for_page eq 'yes');

    # Search for the entity, not for the character: the entity in this
    # literal had been decoded to "†", which made the check flag exactly the
    # spelling that the description asks for.
    my $pos = index($text, '&dagger;');

    if ($pos > -1 and $page_namespace == 0) {
        my $test_text = substr($text, $pos, 100);
        $test_text = text_reduce($test_text, 50);
        # NOTE(review): sibling checks wrap their excerpt in <nowiki>; it is
        # unclear whether this one did as well - confirm against upstream.
        error_register($error_code, '…'.$test_text.'…');
        #print "\t". $error_code."\t".$title."\t".'…'.$test_text.'…'."\n";
    }
}
# Error 021: a category link uses the name in $namespace_cat[1] instead of
# the project's own category namespace name $namespace_cat[0].
#
# Not checked for the projects 'enwiki' and 'commonswiki', outside namespace 0,
# or when $namespace_cat[0] is already 'Category'.
#
# Reads:  @category (column 4), $category_counter, @namespace_cat, $project,
#         $page_namespace, $details_for_page.
# Writes: $error_description[21]; calls error_register() per matching entry.
sub error_021_category_is_english {
    my $error_code = 21;

    $error_description[$error_code][0] = 3;
    $error_description[$error_code][1] = 'Category is english';
    $error_description[$error_code][2] = 'The article had a category in english. It should renamed in "'.$namespace_cat[0].':ABC…". It is ok for the mediawiki software, but a new wikipedian maybe have a problem with the english language.';
    $error_description[$error_code][2] = infotext_new_error( $error_description[$error_code][2] );

    print $error_code."\n" if ($details_for_page eq 'yes');

    return if ($project eq 'enwiki' or $project eq 'commonswiki');
    return if ($page_namespace != 0);
    return if ($namespace_cat[0] eq 'Category');

    # Without a second namespace name there is nothing to look for; index()
    # with an empty needle would report every category.
    return unless (defined $namespace_cat[1] and $namespace_cat[1] ne '');
    my $needle = lc($namespace_cat[1]);

    for (my $i = 0; $i <= $category_counter; $i++) {
        my $current_cat = lc($category[$i][4]);
        if (index($current_cat, $needle) > -1) {
            error_register($error_code, '<nowiki>'.$current_cat.'</nowiki>');
            #print "\t". $error_code."\t".$title."\t".'<nowiki>'.$category[$i][4].'</nowiki>'."\n";
        }
    }
}
# Error 022: a category link has a blank directly after "[[" or directly
# before the ":" (e.g. "[[ Category:ABC]]" or "[[Category : ABC]]").
#
# Reads:  @category (column 4), $category_counter, $page_namespace,
#         $details_for_page.
# Writes: $error_description[22]; calls error_register() per matching entry
#         (namespaces 0 and 6 only).
sub error_022_category_with_space {
    my $error_code = 22;

    $error_description[$error_code][0] = 3;
    $error_description[$error_code][1] = 'Category with space';
    $error_description[$error_code][2] = 'The article had a category a space in front (for example: <nowiki>[[ Category:ABC]]</nowiki> or <nowiki>[[Category : ABC]]</nowiki> ). The mediawiki has no problem with this, but but if you write a external parser this it only one of your problem. Please fix it.';

    print $error_code."\n" if ($details_for_page eq 'yes');

    return unless ($page_namespace == 0 or $page_namespace == 6);

    for (my $i = 0; $i <= $category_counter; $i++) {
        my $link = $category[$i][4];
        #print "\t". $link. "\n";

        my $blank_after_open   = ($link =~ /\[\[ /);
        my $blank_before_colon = ($link =~ /\[\[[^:]+ :/);
        # A blank after the colon ( /\[\[[^:]+: / ) is deliberately not checked.

        if ($blank_after_open or $blank_before_colon) {
            error_register($error_code, '<nowiki>'.$link.'</nowiki>');
            #print "\t". $error_code."\t".$title."\t".'<nowiki>'.$link.'</nowiki>'."\n";
        }
    }
}
# Error 023: a nowiki section that is never closed.
#
# Parameter:
#   $_[0] - text excerpt to report. An empty string (this is how
#           error_check() calls the sub) or undef reports nothing.
#
# Side effects: fills $error_description[23] on every call and calls
# error_register() when an excerpt was given and the page is in namespace
# 0 or 6.
sub error_023_nowiki_no_correct_end {
    my $error_code = 23;
    my $comment    = defined $_[0] ? $_[0] : '';

    $error_description[$error_code][0] = 1;
    $error_description[$error_code][1] = 'Nowiki not correct end';
    $error_description[$error_code][2] = 'Found no nowiki end.';

    print $error_code."\n" if ($details_for_page eq 'yes');

    return if ($comment eq '');
    return unless ($page_namespace == 0 or $page_namespace == 6);

    # Wrapped in <nowiki> like error_005 / error_010 do.
    error_register($error_code, '<nowiki>'.$comment.'</nowiki>');
    #print "\t". $error_code."\t".$title."\n";
}
# Error 024: a pre section that is never closed.
#
# Parameter:
#   $_[0] - text excerpt to report. An empty string (this is how
#           error_check() calls the sub) or undef reports nothing.
#
# Side effects: fills $error_description[24] on every call and calls
# error_register() when an excerpt was given and the page is in namespace
# 0 or 6.
sub error_024_pre_no_correct_end {
    my $error_code = 24;
    my $comment    = defined $_[0] ? $_[0] : '';

    $error_description[$error_code][0] = 1;
    $error_description[$error_code][1] = 'Pre not correct end';
    $error_description[$error_code][2] = 'Found no pre end.';

    print $error_code."\n" if ($details_for_page eq 'yes');

    return if ($comment eq '');
    return unless ($page_namespace == 0 or $page_namespace == 6);

    # Wrapped in <nowiki> like error_005 / error_010 do.
    error_register($error_code, '<nowiki>'.$comment.'</nowiki>');
    #print "\t". $error_code."\t".$title."\n";
}
# Error 025: a headline is more than one level deeper than the headline
# directly before it (e.g. "====" right after "==").
#
# The level of a headline is the number of "=" characters at its start.
#
# Reads:  @headlines, $page_namespace, $details_for_page.
# Writes: $error_description[25]; calls error_register() for every such pair
#         of consecutive headlines (namespace 0 only).
sub error_025_headline_hierarchy {
    my $error_code = 25;

    $error_description[$error_code][0] = 2;
    $error_description[$error_code][1] = 'Headline hierarchy';
    $error_description[$error_code][2] = 'After a headline of level 1 (==) should not be a headline of level 3 (====). (See also W3C Techniques for WCAG 2.0)';

    print $error_code."\n" if ($details_for_page eq 'yes');

    return if ($page_namespace != 0);

    my $old_headline = '';
    my $is_first     = 1;

    foreach my $new_headline (@headlines) {
        if (not $is_first) {
            # Count the leading "=" of both headlines.
            my ($old_marks) = $old_headline =~ /^(=+)/;
            my ($new_marks) = $new_headline =~ /^(=+)/;
            my $level_old = defined $old_marks ? length($old_marks) : 0;
            my $level_new = defined $new_marks ? length($new_marks) : 0;

            if ($level_new - $level_old > 1) {
                # NOTE(review): this literal was split over two lines; the
                # "</nowiki><br /><nowiki>" separator is a reconstruction -
                # confirm against the upstream script.
                error_register($error_code, '<nowiki>'.$old_headline.'</nowiki><br /><nowiki>'.$new_headline.'</nowiki>');
                #print "\t". $error_code."\t".$title."\t".'<nowiki>'.$headlines[0].'</nowiki>'."\n";
            }
        }
        $old_headline = $new_headline;
        $is_first     = 0;
    }
}
# Error 026: the article contains the HTML tag <b>.
#
# The comparison is done on lower-cased text, so "<B>" is found as well.
# Only the first matching line is reported, shortened by text_reduce() to 80
# characters and followed by "…".
#
# Reads:  $text, @lines, $page_namespace, $details_for_page.
# Writes: $error_description[26]; calls error_register() at most once
#         (namespace 0 only).
sub error_026_html_text_style_elements {
    my $error_code = 26;

    $error_description[$error_code][0] = 3;
    $error_description[$error_code][1] = 'HTML text style element <nowiki><b></nowiki>';
    $error_description[$error_code][2] = 'Article contains a <nowiki><b></nowiki>. '.
        "In most cases we can use simpler wiki markups in place of these HTML-like tags.";

    print $error_code."\n" if ($details_for_page eq 'yes');

    # The tag had been stripped from the search literals, leaving '' - and
    # index() with an empty needle matches every string at position 0.
    my $tag = '<b>';

    my $test      = 'no found';
    my $test_line = '';

    # Cheap whole-text test first, then locate the first offending line.
    if ($page_namespace == 0 and index(lc($text), $tag) > -1) {
        foreach my $current_line (@lines) {
            next if (index(lc($current_line), $tag) == -1);
            $test      = 'found';
            $test_line = $current_line if ($test_line eq '');
        }
    }

    if ($test eq 'found') {
        $test_line = text_reduce($test_line, 80);
        $test_line = $test_line.'…';
        error_register($error_code, '<nowiki>'.$test_line.' </nowiki>');
        #print "\t". $error_code."\t".$title."\t".$test_line."\n";
    }
}
# Error 027: the article writes a character as a numeric entity
# (decimal "&#NNNN;" or hexadecimal "&#xNNNN;") instead of the character.
#
# Only three concrete entities are searched for; the generic "&#" / "&#x"
# searches are disabled.
#
# Reads:  $text, $page_namespace, $details_for_page.
# Writes: $error_description[27]; calls error_register() with up to 80
#         characters (via text_reduce) from the first hit (namespaces 0 and 6).
#
# NOTE(review): the three entities had been decoded to the characters
# "ł", "Ĥ" and "–", which made the check flag correctly written text. They
# are re-encoded here from those characters; the exact spelling (decimal or
# hexadecimal, zero padding) should be confirmed against the upstream script.
sub error_027_unicode_syntax {
    my $error_code = 27;

    $error_description[$error_code][0] = 3;
    $error_description[$error_code][1] = 'Unicode syntax';
    # &amp; keeps the example entities readable in the report.
    $error_description[$error_code][2] = 'Find &amp;#0000; (decimal) or &amp;#x0000; (hexadecimal). Please use the Unicode characters.';

    print $error_code."\n" if ($details_for_page eq 'yes');

    $error_description[$error_code][2] = infotext_change_error( $error_description[$error_code][2] );
    $error_description[$error_code][2] = infotext_new_error( $error_description[$error_code][2] );

    return unless ($page_namespace == 0 or $page_namespace == 6);

    # Checked in this order; the first entity that occurs at all decides the
    # reported position.
    my @entities = (
        '&#322;',      # ł  (l in Wrozlaw)
        '&#x0124;',    # Ĥ
        '&#8211;',     # –
        #'&#x',
        #'&#',
    );

    my $pos = -1;
    foreach my $entity (@entities) {
        $pos = index($text, $entity);
        last if ($pos > -1);
    }

    if ($pos > -1) {
        my $found_text = substr($text, $pos);
        $found_text = text_reduce($found_text, 80);
        error_register($error_code, '<nowiki>'.$found_text.'</nowiki>');
        #print "\t". $error_code."\t".$title."\t".$found_text."\n";
    }
}
# Error 28 -- table without a correct end.
#
# Registers the given snippet as an error when it is non-empty, the page is
# in namespace 0 and the article text shows no sign of a table-closing
# "End" / "End box" template.
#
# Arguments: $_[0] -- snippet to report; an empty or undefined value
#            disables the check.
# Globals read:    $text, $page_namespace, $details_for_page
# Globals written: $error_description[28][0..2]
sub error_028_table_no_correct_end{
    my $error_code = 28;
    my $comment    = defined $_[0] ? $_[0] : '';
    $error_description[$error_code][0] = 1;
    $error_description[$error_code][1] = 'Table not correct end';
    $error_description[$error_code][2] = 'Found no end of the table.';
    print $error_code."\n" if ($details_for_page eq 'yes');

    # 'Szablon:End' also covers 'Szablon:End box' (it is a prefix of it).
    # NOTE(review): "Szablon:End" looks like what a wiki renders for a
    # {{end}} / {{end box}} template call, so the template-call form is
    # accepted as well; the first letter of a template name may be written
    # in either case. Confirm the exact needles against the upstream script.
    my $closed_by_template =
           index( $text, 'Szablon:End' ) > -1
        || $text =~ /\{\{[Ee]nd(?: box)?\}\}/;

    if ( $comment ne '' and $page_namespace == 0 and not $closed_by_template ) {
        error_register($error_code, ' '.$comment.'… ');
        #print "\t". $error_code."\t".$title."\n";
    }
}
# Error 29 -- gallery without a correct end.
#
# Registers the given snippet as an error when it is non-empty and the page
# is in namespace 0 or 6.
#
# Arguments: $_[0] -- snippet to report; an empty or undefined value
#            disables the check.
# Globals read:    $page_namespace, $details_for_page
# Globals written: $error_description[29][0..2]
sub error_029_gallery_no_correct_end{
    my $error_code = 29;
    my $comment    = defined $_[0] ? $_[0] : '';
    $error_description[$error_code][0] = 1;
    $error_description[$error_code][1] = 'Gallery not correct end';
    $error_description[$error_code][2] = 'Found no end of the gallery.';
    print $error_code."\n" if ($details_for_page eq 'yes');

    if ( $comment ne '' and ( $page_namespace == 0 or $page_namespace == 6 ) ) {
        error_register($error_code, ''.$comment.'');
        #print "\t". $error_code."\t".$title."\n";
    }
}
# Error 30 -- image without description.
#
# Registers the given snippet as an error when it is non-empty and the page
# is in namespace 0 or 6.
#
# Arguments: $_[0] -- snippet to report; an empty or undefined value
#            disables the check.
# Globals read:    $page_namespace, $details_for_page
# Globals written: $error_description[30][0..2]
sub error_030_image_without_description {
    my $error_code = 30;
    my $comment    = defined $_[0] ? $_[0] : '';
    $error_description[$error_code][0] = 1;
    $error_description[$error_code][1] = 'Image without description';
    $error_description[$error_code][2] = 'The article has an image without a description. In order to provide good accessibility for everyone (e.g. blind people) a description for every image is needed. (See also W3C Techniques for WCAG 2.0) ';
    print $error_code."\n" if ($details_for_page eq 'yes');

    if ( $comment ne '' ) {
        if ( $page_namespace == 0 or $page_namespace == 6 ) {
            error_register($error_code, ''.$comment.'');
            #print "\t". $error_code."\t".$title."\t".$comment."\n";
        }
    }
}
# Error 30 (older, line-based variant) -- image without description.
#
# For pages in namespace 0, scans @lines for lines that *start* with an
# image link ("[[<namespace word>:...") whose only options are a placement
# keyword (thumb|left|right|frame) and/or a three-digit pixel size, with no
# caption text. Each such line is registered, cut off after its first "]]".
#
# Takes no arguments.
# Globals read:    $text, @lines, @namespace_image, $page_namespace,
#                  $details_for_page
# Globals written: $error_description[30][0..2]
sub old_error_030_image_without_description {
    my $error_code = 30;
    $error_description[$error_code][0] = 1;
    $error_description[$error_code][1] = 'Image without description';
    $error_description[$error_code][2] = 'The article has an image without a description. In order to provide good accessibility for everyone (e.g. blind people) a description for every image is needed.';
    print $error_code."\n" if ($details_for_page eq 'yes');

    if ( $page_namespace == 0 ) {
        # Cheap pre-check: skip the line scan when no image namespace word
        # occurs anywhere in the article.
        my $test_text           = lc($text);
        my $found_image_in_text = 0;
        foreach my $namespace_image_word (@namespace_image) {
            $found_image_in_text = $found_image_in_text +1 if ( $test_text =~ /$namespace_image_word:/i );
        }

        if ( $found_image_in_text > 0 ) {
            foreach my $current_line (@lines) {
                # Does this line mention an image namespace at all?
                my $found_image = 0;
                foreach my $namespace_image_word (@namespace_image) {
                    $found_image = $found_image +1 if ( $current_line =~ /$namespace_image_word:/i );
                }

                my $found = 0;
                if ( $found_image > 0 ) {
                    foreach my $namespace_image_word (@namespace_image) {
                        # The matches below are plain boolean tests; only
                        # "$found > 0" is evaluated afterwards, so no /g
                        # modifier is needed on the ^-anchored patterns.

                        # Disabled patterns, kept for reference:
                        # < $found = $found +1 if ( $current_line =~ /^\[\[([ ]+)?$namespace_image_word:([ ]+)?[^\|]+([ ]+)?\]\]/ig);
                        # < $found = $found +1 if ( $current_line =~ /^\[\[([ ]+)?$namespace_image_word:[^ ]+([ ]+)?\|([ ]+)?\]\]/ig);
                        # $found = $found +1 if ( $current_line =~ /^\[\[([ ]+)?$namespace_image_word:[^ ]+([ ]+)?\|([ ]+)?[1-9][0-9][0-9]px([ ]+)?\]\]/ig);
                        # $found = $found +1 if ( $current_line =~ /^\[\[([ ]+)?$namespace_image_word:[^ ]+([ ]+)?\|([ ]+)?[1-9][0-9][0-9]px([ ]+)?\|([ ]+)?\]\]/ig);

                        # placement keyword only
                        $found = $found +1 if ( $current_line =~ /^\[\[([ ]+)?$namespace_image_word:[^ ]+([ ]+)?\|([ ]+)?(thumb|left|right|frame)([ ]+)?\]\]/i );
                        # placement keyword, then an empty option
                        $found = $found +1 if ( $current_line =~ /^\[\[([ ]+)?$namespace_image_word:[^ ]+([ ]+)?\|([ ]+)?(thumb|left|right|frame)([ ]+)?\|([ ]+)?\]\]/i );
                        # placement keyword, then size
                        $found = $found +1 if ( $current_line =~ /^\[\[([ ]+)?$namespace_image_word:[^ ]+([ ]+)?\|([ ]+)?(thumb|left|right|frame)([ ]+)?\|([ ]+)?[1-9][0-9][0-9]px([ ]+)?\]\]/i );
                        # placement keyword, size, then an empty option
                        $found = $found +1 if ( $current_line =~ /^\[\[([ ]+)?$namespace_image_word:[^ ]+([ ]+)?\|([ ]+)?(thumb|left|right|frame)([ ]+)?\|([ ]+)?[1-9][0-9][0-9]px([ ]+)?\|([ ]+)?\]\]/i );
                        # size, then placement keyword
                        $found = $found +1 if ( $current_line =~ /^\[\[([ ]+)?$namespace_image_word:[^ ]+([ ]+)?\|([ ]+)?[1-9][0-9][0-9]px([ ]+)?\|([ ]+)?(thumb|left|right|frame)([ ]+)?\]\]/i );
                        # size, placement keyword, then an empty option
                        $found = $found +1 if ( $current_line =~ /^\[\[([ ]+)?$namespace_image_word:[^ ]+([ ]+)?\|([ ]+)?[1-9][0-9][0-9]px([ ]+)?\|([ ]+)?(thumb|left|right|frame)([ ]+)?\|([ ]+)?\]\]/i );
                    }
                }

                if ( $found > 0 ) {
                    # Report the line up to and including its first "]]".
                    my $output = substr( $current_line, 0, index( $current_line, ']]' ) + 2 );
                    error_register($error_code, ''.$output.'');
                    #print $title."\n";
                    #print $output."\n";
                    #die;
                }
            }
        }
    }
}
sub error_031_html_table_elements{ my $error_code = 31; $error_description[$error_code][0] = 2; $error_description[$error_code][1] = 'HTML table element'; $error_description[$error_code][2] = 'Article contains a "<table>", "<td>", "<th>" or "<tr>". '. "In most cases we can use simpler wiki markups in place of these HTML-like tags."; print $error_code."\n" if ($details_for_page eq 'yes');
my $test = 'no found'; my $test_line = ; my $test_text = lc($text); if ($page_namespace == 0 or $page_namespace == 6) { if (index($test_text, '<t') > -1) { foreach (@lines) { my $current_line = $_; my $current_line_lc = lc($current_line);
if ( $page_namespace == 0 and (
#index( $current_line_lc, '
') > -1 #or index( $current_line_lc, ' | ') > -1 #or index( $current_line_lc, ' |
---|
- A reference with bold text