User:PeerInfinity/Scripts/SyncArticleLinks.php
< User:PeerInfinity | Scripts
Jump to navigation
Jump to search
<?php
// SyncArticleLinks.php
// Synchronizes the list of article links on the concept pages with the list of concepts on the "All Articles" pages.
// for the latest version of this script's output, see:
///stderr output - the stderr output of this script
///SyncArticleLinksOutput.txt - the output written to the text file, containing the Sync results
//
// Script-wide state. Everything below is a plain global that the XML
// callbacks (startElement / characterData / endElement) pull in with `global`.

// handle used for progress and error messages
$stderr = fopen( "php://stderr", "wt" );

// file name of the XML dump; presumably the input parsed by this script
// (the code that opens it is outside this section)
$XMLfile = "daily_XML_dump.xml";

// the arrays for the data read from the All Articles pages:
// it would be more proper to create a struct for this, but for now I'll just do it the quick and dirty way
$NextAllArticleIndex = 0;

// these arrays are indexed by $NextAllArticleIndex
$ArrayAllArticleTitle              = array();
$ArrayAllArticleLink               = array();
$ArrayAllArticleIndexedConcepts    = array(); // this is an array of arrays!
$ArrayAllArticleFoundConcepts      = array(); // this is an array of arrays!
$ArrayAllArticleNotIndexedConcepts = array(); // this is an array of arrays!
$ArrayAllArticleAuthor             = array();
$ArrayAllArticleDate               = array(); // currently unused. This would need to be read by following the link to the article, which might be a good idea to implement eventually
$ArrayAllArticleOfficialSummaries  = array(); // this is an array of arrays! this one is indexed by article name!
$ArrayAllArticleUsedSummaries      = array(); // this is an array of arrays!

// the arrays for all Concepts found
// these arrays are indexed by concept title
$ConceptFound            = array();
$ConceptNotInIndex       = array();
$ConceptThatAreRedirects = array();
$ConceptSeeAlso          = array(); // this is an array of arrays!

// these arrays are indexed by concept title
$PagesWithOvercomingBiasLinks          = array();
$PagesWithComments                     = array();
$PagesWithOvercomingBiasArticlesHeader = array();
$PagesWithExternalReferences           = array();
$PagesWithSeeAlso                      = array();
$PagesWithExternalAuthorLinks          = array();
$PagesWithNewlineAfterWikiLink         = array();
$PagesWithSeeAlsoBeforeBlogPosts       = array();

// ugh... this is ugly:
// these arrays are indexed by the index variables below
$ArticleLinksWithoutEndingSlash   = array();
$ArticleLinksWithWrongTitle       = array();
$ArticleLinksWithoutAuthor        = array();
$ArticleLinksWithAvailableSummary = array();

$ArticleLinksWithoutEndingSlashConcept   = array();
$ArticleLinksWithWrongTitleConcept       = array();
$ArticleLinksWithoutAuthorConcept        = array();
$ArticleLinksWithAvailableSummaryConcept = array();

// Next free slot in each of the report arrays above. These are the names the
// XML callbacks declare with `global`; they must start at 0, otherwise the
// first entry is stored under a null (i.e. "") key instead of index 0.
$ArticleLinksWithoutEndingSlashNextIndex   = 0;
$ArticleLinksWithWrongTitleNextIndex       = 0;
$ArticleLinksWithoutAuthorNextIndex        = 0;
$ArticleLinksWithAvailableSummaryNextIndex = 0;

// Older spellings of the last two counters. The callbacks in this section do
// not use them; they are still initialised in case other code refers to them.
$ArticleLinksWithoutAuthorConceptNextIndex = 0;
$ArticleLinksWithAvailableSummaryIndex     = 0;

// title and accumulated text of the page currently being parsed
$CurrentTitle = "";
$CurrentBody  = "";

// which XML element the parser is currently inside
$IsReadingTitle = false;
$IsReadingText  = false;

// which kind of page the current pass over the dump is looking for
$IsReadingAllArticlesPages = false;
$IsReadingSummariesPages   = false;
$IsReadingConceptPages     = false;

// whether the page currently being parsed is one we want to keep the text of
$IsReadingOneAllArticlesPage = false;
$IsReadingOneSummariesPage   = false;
$IsReadingOneConceptPage     = false;

// range of "Less Wrong/<year> Articles" pages to read
$FirstYearToRead = 2006;
$FinalYearToRead = 2010; //todo - update this in 2011!!!
$NextYearToRead         = $FirstYearToRead;
$YearCurrentlyBeingRead = $FirstYearToRead;

$SuccessfullyReadOneAllArticlesPage = false;
$SuccessfullyReadOneSummariesPage   = false;
$FailedToReadAllArticlesPage        = false;
$SuccessfullyReadOneConceptPage     = false;
$FailedToReadConceptPage            = false;

$PagesRead = 0;

$Debug = true;

// for doxygen
$fontsize = 12;

// to make the script easier to read, pad each section to a specific number of characters, to make everything line up
$PaddingValue1 = 100;
$PaddingValue2 = 50;

//open a text file for the output of this script
//the script will send its output both to this text file and to stdout (or was it stderr?)
$fp = fopen('SyncArticleLinksOutput.txt', 'w');

// now for some code that I still haven't figured out a good way to untangle
// this processing currently needs to be done in these functions, and can't be moved to a more sensible place

/**
 * Start-of-element callback: remembers whether the element being entered is
 * a TITLE or a TEXT element, so that characterData() knows what to do with
 * the text it receives.
 *
 * Presumably registered with xml_set_element_handler() outside this section.
 *
 * @param mixed  $parser  parser handle (unused here)
 * @param string $name    element name; compared against "TITLE" and "TEXT"
 * @param array  $attribs element attributes (unused here)
 */
function startElement($parser, $name, $attribs)
{
    global $IsReadingTitle;
    global $IsReadingText;

    // remember what element we're reading, so that we know what to do in characterData()
    $IsReadingTitle = ( $name == "TITLE" );
    $IsReadingText  = ( $name == "TEXT" );
}

/**
 * Character-data callback.
 *
 * While inside a TITLE element it stores the title in $CurrentTitle, clears
 * $CurrentBody, and decides (via the $IsReadingOne... flags) whether the text
 * of this page should be collected. While inside a TEXT element it appends
 * the data to $CurrentBody when one of those flags is set; the collected text
 * is processed in endElement().
 *
 * The output file is opened once at file level ($fp); this callback does not
 * open it, because re-opening it in 'w' mode would truncate it on every call.
 *
 * NOTE(review): PHP's XML extension may deliver the text of one element in
 * several calls. $IsReadingTitle is cleared after the first call, so only the
 * first piece of a title is kept - confirm whether titles in the dump can be
 * split (e.g. around entities).
 *
 * NOTE(review): in "All Articles" mode the title test is a substring match,
 * so "Less Wrong/<year> Articles/Summaries" also matches - confirm the page
 * order in the dump makes this harmless.
 *
 * @param mixed  $parser parser handle (unused here)
 * @param string $data   piece of character data from the current element
 */
function characterData($parser, $data)
{
    global $stderr;
    global $CurrentTitle;
    global $CurrentBody;
    global $IsReadingTitle;
    global $IsReadingText;
    global $IsReadingAllArticlesPages;
    global $IsReadingSummariesPages;
    global $IsReadingConceptPages;
    global $IsReadingOneAllArticlesPage;
    global $IsReadingOneSummariesPage;
    global $IsReadingOneConceptPage;
    global $FirstYearToRead;
    global $FinalYearToRead;
    global $NextYearToRead;
    global $YearCurrentlyBeingRead;
    global $PagesRead;

    // if we're reading the title, then remember the title, and check if we want to read the page content
    if( $IsReadingTitle )
    {
        $CurrentBody    = "";
        $CurrentTitle   = $data;
        $IsReadingTitle = false;

        $IsReadingOneAllArticlesPage = false;
        $IsReadingOneSummariesPage   = false;
        $IsReadingOneConceptPage     = false;

        // skip category pages, template pages, etc.
        $SkippedTitleParts = array( "Category:", "Template:", "Talk:", "Category talk:" );
        $SkippedTitles = array(
            "Catch Phrases",
            "Categories",
            "Chat Logs/2009-04-11",
            "Disagreements on Less Wrong",
            "Series",
            "Using the wiki",
            "Acronyms used on Less Wrong",
            "Less Wrong/Errors from moving Eliezer's posts from OB to LW",
        );

        $SkipThisPage = in_array( $CurrentTitle, $SkippedTitles, true );
        foreach( $SkippedTitleParts as $TitlePart )
        {
            if( strpos( $CurrentTitle, $TitlePart ) !== FALSE )
            {
                $SkipThisPage = true;
            }
        }

        if( !$SkipThisPage )
        {
            // the year pages are named after the year, so the expected title follows from $NextYearToRead
            $NextYearIsInRange   = ( $NextYearToRead >= $FirstYearToRead && $NextYearToRead <= $FinalYearToRead );
            $NextAllArticlesName = "Less Wrong/" . $NextYearToRead . " Articles";
            $NextSummariesName   = $NextAllArticlesName . "/Summaries";

            if( $IsReadingAllArticlesPages && $NextYearToRead <= $FinalYearToRead )
            {
                // check if the page title is the next year we want to read
                if( $NextYearIsInRange && strpos( $CurrentTitle, $NextAllArticlesName ) !== FALSE )
                {
                    $YearCurrentlyBeingRead = $NextYearToRead;
                    $NextYearToRead++;
                    $IsReadingOneAllArticlesPage = true;
                    fwrite( $stderr, "Processing the wikipage for $YearCurrentlyBeingRead \n\n" );
                }
            }
            else if( $IsReadingSummariesPages && $NextYearToRead <= $FinalYearToRead )
            {
                // check if the page title is the next year we want to read
                if( $NextYearIsInRange && strpos( $CurrentTitle, $NextSummariesName ) !== FALSE )
                {
                    $YearCurrentlyBeingRead = $NextYearToRead;
                    $NextYearToRead++;
                    $IsReadingOneSummariesPage = true;
                    fwrite( $stderr, "Processing the summaries page for $YearCurrentlyBeingRead \n\n" );
                }
            }
            else if( $IsReadingConceptPages )
            {
                // don't process the All Articles pages!
                $IsAnAllArticlesPage = false;
                for( $Year = $FirstYearToRead; $Year <= $FinalYearToRead; $Year++ )
                {
                    if( strpos( $CurrentTitle, "Less Wrong/" . $Year . " Articles" ) !== FALSE )
                    {
                        $IsAnAllArticlesPage = true;
                    }
                }

                if( !$IsAnAllArticlesPage )
                {
                    // no special processing here, just remember the title
                    $IsReadingOneConceptPage = true;

                    // progress report every 100 pages
                    if( $PagesRead % 100 == 0 )
                    {
                        fwrite( $stderr, "Pages read: $PagesRead\n" );
                    }
                    $PagesRead++;
                }
            }
            else if( $IsReadingAllArticlesPages )
            {
                // reading All Articles pages, past the final year: nothing to do
            }
            else if( $IsReadingSummariesPages )
            {
                // reading summary pages, past the final year: nothing to do
            }
            else
            {
                fwrite( $stderr, "Error: not reading all articles, summaries, or concept pages\n\n" );
            }
        }
    }

    // if we're reading the text, then store the content of the article
    // we'll process the data in endElement()
    // there is probably a more efficient way to do this
    if( $IsReadingText )
    {
        if( $IsReadingOneAllArticlesPage )
        {
            $CurrentBody .= $data;
        }
        if( $IsReadingOneSummariesPage )
        {
            $CurrentBody .= $data;
        }
        if( $IsReadingOneConceptPage )
        {
            $CurrentBody .= $data;
        }
    }
}

// The opening of endElement() below is carried over unchanged; the function
// continues on the lines that follow this block.
function endElement($parser, $name)
{
    global $stderr;
    global $XMLfile;
    global $NextAllArticleIndex;
    global $ArrayAllArticleTitle ;
    global $ArrayAllArticleLink ;
    global $ArrayAllArticleIndexedConcepts ;
    global $ArrayAllArticleFoundConcepts ;
    global $ArrayAllArticleNotIndexedConcepts;
    global $ArrayAllArticleAuthor ;
    global $ArrayAllArticleDate ;
    global $ArrayAllArticleOfficialSummaries ;
    global $ArrayAllArticleUsedSummaries ;
    global $ConceptFound;
    global $ConceptNotInIndex;
    global $ConceptThatAreRedirects;
    global $ConceptSeeAlso;
    global $PagesWithOvercomingBiasLinks;
    global $PagesWithComments;
    global $PagesWithOvercomingBiasArticlesHeader;
    global $PagesWithExternalReferences;
    global $PagesWithSeeAlso;
    global $PagesWithExternalAuthorLinks;
    global $PagesWithNewlineAfterWikiLink;
    global $PagesWithSeeAlsoBeforeBlogPosts;
    global $ArticleLinksWithoutEndingSlash;
    global $ArticleLinksWithWrongTitle;
    global $ArticleLinksWithoutAuthor;
    global $ArticleLinksWithAvailableSummary;
    global $ArticleLinksWithoutEndingSlashConcept;
    global
$ArticleLinksWithWrongTitleConcept; global $ArticleLinksWithoutAuthorConcept; global $ArticleLinksWithAvailableSummaryConcept; global $ArticleLinksWithoutEndingSlashNextIndex; global $ArticleLinksWithWrongTitleNextIndex; global $ArticleLinksWithoutAuthorNextIndex; global $ArticleLinksWithAvailableSummaryNextIndex; global $CurrentTitle; global $CurrentBody ; global $IsReadingTitle; global $IsReadingText ; global $IsReadingAllArticlesPages; global $IsReadingSummariesPages ; global $IsReadingConceptPages ; global $IsReadingOneAllArticlesPage; global $IsReadingOneSummariesPage ; global $IsReadingOneConceptPage ; global $FirstYearToRead; global $FinalYearToRead; global $NextYearToRead; global $YearCurrentlyBeingRead; global $SuccessfullyReadOneAllArticlesPage; global $SuccessfullyReadOneSummariesPage ; global $FailedToReadAllArticlesPage ; global $SuccessfullyReadOneConceptPage; global $FailedToReadConceptPage; global $PagesRead; global $Debug; // now process all the text that we read if( $name == "TEXT" ) { // this check is probably unnecessary if( $IsReadingText ) { if( $IsReadingOneAllArticlesPage ) { //fwrite( $stderr, "-------------------------------------------reading body of year $YearCurrentlyBeingRead , $CurrentTitle \n\n" ); //fwrite( $stderr, $CurrentBody ); // now parse the data from this page and store it into the arrays $CurrentGroupStartPos = 0; // skip to the first '|-' $NextGroupStartPos = strpos( $CurrentBody, "|-", $CurrentGroupStartPos+1 ); // keep going until there are no more groups while( $NextGroupStartPos !== FALSE && $CurrentGroupStartPos != $NextGroupStartPos ) { // find the start and end of teh current group $CurrentGroupStartPos = $NextGroupStartPos; $NextGroupStartPos = strpos( $CurrentBody, "|-", $CurrentGroupStartPos+1 ); // if there is no next group, then set the end of this group to the end of the whole string if( $NextGroupStartPos === FALSE ) { $NextGroupStartPos = strlen( $CurrentBody ) - 1; } // now find and store the parts // there 
is probably a much simpler way to do all this using regexes $DataIsValid = true; $CurrentArticleConceptArray = array(); $TokenStartPos = strpos($CurrentBody, "[", $CurrentGroupStartPos) + 1; $TokenEndPos = strpos($CurrentBody, " ", $TokenStartPos); $CurrentArticleLink = substr( $CurrentBody, $TokenStartPos, $TokenEndPos-$TokenStartPos ); if( substr_count( $CurrentArticleLink, "http://" ) <= 0 ) { $DataIsValid = false; } $TokenStartPos = $TokenEndPos + 1; $TokenEndPos = strpos($CurrentBody, "]", $TokenStartPos); $CurrentArticleTitle = substr( $CurrentBody, $TokenStartPos, $TokenEndPos-$TokenStartPos ); $CurrentArticleTitle = str_replace(""", "\"", $CurrentArticleTitle); $NextLineStartPos = strpos($CurrentBody, "|", $TokenEndPos); $NextLineEndPos = strpos($CurrentBody, "|", $NextLineStartPos+1); $ConceptLine = substr( $CurrentBody, $NextLineStartPos, $NextLineEndPos-$NextLineStartPos ); //fwrite( $stderr, "concept line start: $NextLineStartPos \n concept line end: $NextLineEndPos \n concept line: $ConceptLine\n\n" ); $FinishedFindingConcepts = false; $ConceptIndex = 0; $TokenStartPos = 0; $TokenEndPos = 0; while( !$FinishedFindingConcepts ) { $TokenStartPos = strpos($ConceptLine, "[[", $TokenEndPos); $TokenEndPos = strpos($ConceptLine, "]]", $TokenStartPos); $CurrentConcept = substr( $ConceptLine, $TokenStartPos+2, $TokenEndPos-$TokenStartPos-2 ); if ( $TokenEndPos > $TokenStartPos && substr_count($CurrentConcept, "[[") <= 0 && substr_count($CurrentConcept, "]]") <= 0 ) { //fwrite( $stderr, "found concept: $CurrentConcept\n\n" ); $CurrentArticleConceptArray[$ConceptIndex] = $CurrentConcept; $ConceptIndex++; $FinishedFindingConcepts = false; } else { $FinishedFindingConcepts = true; } //fwrite( $stderr, "concept start: $TokenStartPos \n concept end: $TokenEndPos \n concept index: $ConceptIndex \n concept line: $ConceptLine\n\n" ); } $NextLineStartPos = strpos($CurrentBody, "|", $NextLineEndPos); $NextLineEndPos = strpos($CurrentBody, "|", $NextLineStartPos+1); 
$TokenStartPos = strpos($CurrentBody, "[", $NextLineStartPos); $TokenEndPos = strpos($CurrentBody, "]", $TokenStartPos); $CurrentArticleAuthor = substr( $CurrentBody, $TokenStartPos, $TokenEndPos-$TokenStartPos+1 ); if( $DataIsValid ) { $ArrayAllArticleTitle [$NextAllArticleIndex] = $CurrentArticleTitle; $ArrayAllArticleLink [$NextAllArticleIndex] = $CurrentArticleLink; //$CurrentArticleConceptArray[] = "concept1"; $ArrayAllArticleIndexedConcepts [$NextAllArticleIndex] = $CurrentArticleConceptArray; $ArrayAllArticleFoundConcepts [$NextAllArticleIndex] = array(); $ArrayAllArticleNotIndexedConcepts [$NextAllArticleIndex] = array(); $ArrayAllArticleAuthor [$NextAllArticleIndex] = $CurrentArticleAuthor; $ArrayAllArticleDate [$NextAllArticleIndex] = "date"; $ArrayAllArticleUsedSummaries [$NextAllArticleIndex] = array(); //$TempString = print_r($ArrayAllArticleIndexedConcepts[$NextAllArticleIndex]); //fwrite( $stderr, "start: $CurrentGroupStartPos \n end: $NextGroupStartPos \n index: $NextAllArticleIndex \n $ArrayAllArticleTitle[$NextAllArticleIndex] \n $ArrayAllArticleLink[$NextAllArticleIndex] \n $TempString \n $ArrayAllArticleAuthor[$NextAllArticleIndex] \n $ArrayAllArticleDate[$NextAllArticleIndex] \n\n" ); $NextAllArticleIndex++; } } $SuccessfullyReadOneAllArticlesPage = true; } if( $IsReadingOneSummariesPage ) { //fwrite( $stderr, "reading summaries page for $YearCurrentlyBeingRead \n\n" ); $NextSummaryTitleStart = 0; $NextSummaryTitleEnd = 0; $NextSummaryTextBlockStart = 0; $NextSummaryTextBlockEnd = 0; $NextSummaryTextStart = 0; $NextSummaryTextEnd = 0; $AlternateSummaryCount = 0; $FinishedReadingSummaries = false; while( !$FinishedReadingSummaries ) { $NextSummaryTitleStart = strpos($CurrentBody, "=====[", $NextSummaryTitleEnd); //fwrite( $stderr, "CurrentBody: $CurrentBody \n\n" ); //fwrite( $stderr, "NextSummaryTitleStart: $NextSummaryTitleStart \n\n" ); if( $NextSummaryTitleStart === FALSE ) { $FinishedReadingSummaries = true; } else { $NextSummaryTitleStart 
+= 6; $NextSummaryTitleEnd = strpos($CurrentBody, "]=====", $NextSummaryTitleStart); $FirstSpacePos = strpos($CurrentBody, " ", $NextSummaryTitleStart); $CurrentSummaryLink = substr( $CurrentBody, $NextSummaryTitleStart, $FirstSpacePos-$NextSummaryTitleStart ); $CurrentSummaryTitle = substr( $CurrentBody, $FirstSpacePos+1, $NextSummaryTitleEnd-$FirstSpacePos-1 ); $CurrentSummaryTitle = str_replace(""", "\"", $CurrentSummaryTitle); //fwrite( $stderr, "reading summary of $CurrentSummaryTitle \n\n" ); $NextSummaryTextBlockStart = $NextSummaryTitleEnd + 8; $NextSummaryTextBlockEnd = strpos($CurrentBody, "=====[", $NextSummaryTitleEnd); if( $NextSummaryTextBlockEnd === FALSE ) { $NextSummaryTextBlockEnd = strlen($CurrentBody); } $NextSummaryTextBlock = substr( $CurrentBody, $NextSummaryTextBlockStart, $NextSummaryTextBlockEnd-$NextSummaryTextBlockStart ); $AlternateSummaryCount = substr_count( $NextSummaryTextBlock, "(alternate summary:)" ); $NextSummaryTextEnd = 0; $ArrayAllArticleOfficialSummaries[$CurrentSummaryTitle] = array(); for( $AlternateSummaryNum = 0; $AlternateSummaryNum < $AlternateSummaryCount; $AlternateSummaryNum++ ) { $NextSummaryTextStart = $NextSummaryTextEnd; $NextSummaryTextEnd = strpos( $NextSummaryTextBlock, "\n\n(alternate summary:)\n\n", $NextSummaryTextStart ); $SummaryToAdd = substr( $NextSummaryTextBlock, $NextSummaryTextStart, $NextSummaryTextEnd-$NextSummaryTextStart ); $ArrayAllArticleOfficialSummaries[$CurrentSummaryTitle][$AlternateSummaryNum] = $SummaryToAdd; $NextSummaryTextEnd += strlen( "\n\n(alternate summary:)\n\n" ); } $NextSummaryTextStart = $NextSummaryTextEnd; $SummaryToAdd = substr( $NextSummaryTextBlock, $NextSummaryTextStart ); $SummaryToAdd = trim( $SummaryToAdd, "\n" ); if( strlen( $SummaryToAdd ) > 0 ) { if( strpos( $SummaryToAdd, "__NOTOC__" ) !== FALSE ) { //todo - decide how to deal with this case! 
} else { $ArrayAllArticleOfficialSummaries[$CurrentSummaryTitle][$AlternateSummaryCount] = $SummaryToAdd; } } } } $SuccessfullyReadOneSummariesPage = true; } if( $IsReadingOneConceptPage ) { //fwrite( $stderr, "------reading body of $CurrentTitle \n\n" ); // don't bother processing the page unless it has a "Blog posts" section $TokenStartPos = 0; $TokenEndPos = 0; // keep track of which pages have links to OvercomingBias.com articles if( substr_count( $CurrentBody, "http://www.overcomingbias.com/20" ) > 0 ) { // the following pages are "allowed" to have OvercomingBias.com articles - these pages were already checked manually for links that should point to lesswrong.com instead if ( $CurrentTitle == "Aumann's agreement theorem" || $CurrentTitle == "Bias" || $CurrentTitle == "Bite the bullet" || $CurrentTitle == "Black swan" || $CurrentTitle == "Catch Phrases" || $CurrentTitle == "Chat Logs/2009-04-11" || $CurrentTitle == "Cognitive style" || $CurrentTitle == "Coherence" || $CurrentTitle == "Connotation" || $CurrentTitle == "Consistency" || $CurrentTitle == "Cryonics" || $CurrentTitle == "Dark arts" || $CurrentTitle == "Disagreement" || $CurrentTitle == "Disagreements on Less Wrong" || $CurrentTitle == "Emotion" || $CurrentTitle == "Extraordinary evidence" || $CurrentTitle == "Forecast" || $CurrentTitle == "Hypocrisy" || $CurrentTitle == "Impossible world" || $CurrentTitle == "Intellectual roles" || $CurrentTitle == "Likelihood ratio" || $CurrentTitle == "Meme lineage" || $CurrentTitle == "Modesty argument" || $CurrentTitle == "Near/far thinking" || $CurrentTitle == "Overcoming Bias" || $CurrentTitle == "Overconfidence" || $CurrentTitle == "Prediction market" || $CurrentTitle == "Scales of justice fallacy" || $CurrentTitle == "Series" || $CurrentTitle == "Signaling" || $CurrentTitle == "Signalling" || $CurrentTitle == "Status" || $CurrentTitle == "Stereotype" || false ) { // do nothing } else { $PagesWithOvercomingBiasLinks[$CurrentTitle] = true; } } if( substr_count( 
$CurrentBody, "<!--" ) > 0 ) { // the following pages are "allowed" to have comments if ( $CurrentTitle == "LessWrong Wiki" ) { // do nothing } else { $PagesWithComments[$CurrentTitle] = true; } } if( substr_count( $CurrentBody, "==Overcoming Bias Articles==" ) > 0 ) { $PagesWithOvercomingBiasArticlesHeader[$CurrentTitle] = true; } if( substr_count( $CurrentBody, "External references" ) > 0 ) { $PagesWithExternalReferences[$CurrentTitle] = true; } if( substr_count( $CurrentBody, "See Also" ) > 0 ) { $PagesWithSeeAlso[$CurrentTitle] = true; } if( substr_count( $CurrentBody, "by [http" ) > 0 ) { $PagesWithExternalAuthorLinks[$CurrentTitle] = true; } if( substr_count( $CurrentBody, "wikilink}}\n\n" ) > 0 ) { $PagesWithNewlineAfterWikiLink[$CurrentTitle] = true; } $SeeAlsoPos = strpos($CurrentBody, "==See also=="); $BlogPostsPos = strpos($CurrentBody, "==Blog posts=="); if( $SeeAlsoPos !== FALSE && $BlogPostsPos !== FALSE ) { if( $SeeAlsoPos < $BlogPostsPos ) { $PagesWithSeeAlsoBeforeBlogPosts[$CurrentTitle] = true; } } if( substr_count( $CurrentBody, "#REDIRECT" ) > 0 ) { if( array_key_exists( $CurrentTitle, $ConceptFound ) ) { $ConceptThatAreRedirects[$CurrentTitle] = true; } } //todo - reconsider this!!! 
//if( substr_count( $CurrentArticleLink, "==Blog posts==" ) > 0 ) if( substr_count( $CurrentBody, "http://lesswrong.com/lw/" ) > 0 ) { // first check if the concept exists in the index $ConceptIsInIndex = false; if( array_key_exists( $CurrentTitle, $ConceptFound ) ) { $ConceptIsInIndex = true; } if( $ConceptIsInIndex ) { // if the concept exists in the index, then mark the concept as found $ConceptFound[$CurrentTitle] = true; //fwrite( $stderr, "++++++found: $CurrentTitle\n\n" ); } else { // if the concept doesn't exist in the index, then mark the concept as not found $ConceptNotInIndex[$CurrentTitle] = true; } // now keep track of the "See also" links if( array_key_exists($CurrentTitle, $ConceptFound ) ) { if( $ConceptFound[$CurrentTitle] == true ) { $SeeAlsoStartPos = strpos($CurrentBody, "==See also==", 0); if( $SeeAlsoStartPos !== FALSE ) { $SeeAlsoStartPos += 12; } else { $SeeAlsoStartPos = strpos($CurrentBody, "==Related concepts==", 0); if( $SeeAlsoStartPos !== FALSE ) { $SeeAlsoStartPos += 20; } else { } } if( $SeeAlsoStartPos !== FALSE ) { $SeeAlsoEndPos = strpos($CurrentBody, "==", $SeeAlsoStartPos); if( $SeeAlsoEndPos === FALSE ) { $SeeAlsoEndPos = strlen($CurrentBody); } $TokenStartPos = $SeeAlsoStartPos; $TokenEndPos = $SeeAlsoStartPos; $DoneSeeAlso = false; $EntriesFound = 0; $ConceptSeeAlso[$CurrentTitle] = array(); //fwrite( $stderr, "Concept: " . $CurrentTitle . "\n" ); while( !$DoneSeeAlso ) { $TokenStartPos = strpos($CurrentBody, "[[", $TokenEndPos); $TokenEndPos = strpos($CurrentBody, "]]", $TokenStartPos); if ( $TokenStartPos < $SeeAlsoEndPos && $TokenStartPos >= $SeeAlsoStartPos && $TokenStartPos !== FALSE && $TokenEndPos !== FALSE ) { $CurrentSeeAlso = substr( $CurrentBody, $TokenStartPos+2, $TokenEndPos-$TokenStartPos-2 ); if( substr_count( $CurrentSeeAlso, "Category:") <= 0 ) { $ConceptSeeAlso[$CurrentTitle][$EntriesFound] = $CurrentSeeAlso; //fwrite( $stderr, $ConceptSeeAlso[$CurrentTitle][$EntriesFound] . 
"\n" ); $EntriesFound++; } } else { $DoneSeeAlso = true; } //fwrite( $stderr, "SeeAlsoStartPos: $SeeAlsoStartPos SeeAlsoEndPos: $SeeAlsoEndPos TokenStartPos: $TokenStartPos TokenEndPos $TokenEndPos" . "\n" ); } } } } $TokenStartPos = strpos($CurrentBody, "http://lesswrong.com/lw/", $TokenEndPos); $TokenEndPos = strpos($CurrentBody, " ", $TokenStartPos); $NextTokenStartPos = strpos($CurrentBody, "http://lesswrong.com/lw/", $TokenEndPos); if( $NextTokenStartPos === FALSE ) { $NextTokenStartPos = strlen($CurrentBody); } $CurrentLink = substr( $CurrentBody, $TokenStartPos, $TokenEndPos-$TokenStartPos ); $FinishedFindingLinks = false; while( !$FinishedFindingLinks ) { // check if the link appears before or after the first header $FirstHeaderPos = strpos($CurrentBody, "==", 0); $LinkIsAfterHeader = false; if( $FirstHeaderPos === FALSE ) { $LinkIsAfterHeader = false; } else { if( $FirstHeaderPos < $TokenStartPos ) { $LinkIsAfterHeader = true; } else { $LinkIsAfterHeader = false; } } // ignore the following links: if ( substr_count( $CurrentLink, "http://lesswrong.com/lw/b1/persuasiveness_vs_soundness/789") <= 0 && substr_count( $CurrentLink, "http://lesswrong.com/lw/14v/the_usefulness_of_correlations/11iu") <= 0 ) { // check if the link ends with a / $LastCharInLink = substr( $CurrentLink, -1 ); $SlashCountBeforeAdd = substr_count( $CurrentLink, "/"); if ( strcmp( $LastCharInLink, "/" ) != 0 ) { $CurrentLink .= "/"; } // check if the link's title is correct $TitleStartPos = $TokenEndPos+1; $TitleEndPos = strpos($CurrentBody, "]", $TitleStartPos); $CurrentArticleTitle = substr( $CurrentBody, $TitleStartPos, $TitleEndPos-$TitleStartPos ); $CorrectArticleTitle = "(not found)"; $CorrectArticleAuthor = "(not found)"; $SearchResult = array_search( $CurrentLink, $ArrayAllArticleLink ); if( $SearchResult !== FALSE ) { $CorrectArticleTitle = $ArrayAllArticleTitle [$SearchResult]; $CorrectArticleAuthor = $ArrayAllArticleAuthor[$SearchResult]; //don't do any processing for summaries 
unless the link we're looking at is after the first header if( $LinkIsAfterHeader ) { $LinkEndPos = strpos($CurrentBody, "]", $TokenStartPos) + 1; $NewLinePos = strpos($CurrentBody, "\n", $LinkEndPos); $NextArticleSummary = ""; if( $NewLinePos === FALSE ) { $NextArticleSummary = substr( $CurrentBody, $LinkEndPos ); } else { //$NextArticleSummary .= "\nsummary:\n" + substr( $CurrentBody, $TokenStartPos, $NextTokenStartPos-$TokenStartPos ); $NextArticleSummary = substr( $CurrentBody, $LinkEndPos, $NewLinePos-$LinkEndPos ); } // search through the array of official summaries for this article, to see if any of them are found in the current article // if a match is found, then don't add the summary to the array of used summaries $SummaryExistsForThisArticle = false; $FoundAMatchingSummary = false; $FirstFoundSummary = ""; if( array_key_exists( $CorrectArticleTitle, $ArrayAllArticleOfficialSummaries ) ) { $SummaryExistsForThisArticle = true; foreach( $ArrayAllArticleOfficialSummaries[$CorrectArticleTitle] as $key => $val ) { if( substr_count( $CurrentBody, $val ) > 0 ) { if( !$FoundAMatchingSummary ) { $FirstFoundSummary = $val; } $FoundAMatchingSummary = true; } } } $AddThisSummaryToTheArray = false; if( strpos( $NextArticleSummary, "by [[" ) > 0 ) { //todo - consider adding a check for improperly formatted author links //todo - consider adding a check for an author link followed by a summary //todo - consider adding a check for a summary that doesn't appear until after the newline } else if( strlen( $NextArticleSummary ) > 0 ) { if( $FoundAMatchingSummary ) { // if we found a matching summary, then we don't need to do anything here } else { $AddThisSummaryToTheArray = true; } } else if( $SummaryExistsForThisArticle ) { // a summary is available for this article, but the summary isn't on this page. // report this to //$ArticleLinksWithAvailableSummary[$ArticleLinksWithAvailableSummaryNextIndex] = "*[$CurrentLink $CorrectArticleTitle] - " . 
$ArrayAllArticleOfficialSummaries[$CorrectArticleTitle][0]; //$ArticleLinksWithAvailableSummary[$ArticleLinksWithAvailableSummaryNextIndex] = "*[$CurrentLink $CorrectArticleTitle] - " . $FirstFoundSummary; // just accessing the array directly doesn't work, and neither did that $FirstFoundSummary trick. for some reason, I need to use a foreach //fwrite( $stderr, "$FirstFoundSummary\n" ); //fwrite( $stderr, "$ArrayAllArticleOfficialSummaries[$CorrectArticleTitle][0]\n" ); $IsFirstIteration = true; foreach( $ArrayAllArticleOfficialSummaries[$CorrectArticleTitle] as $key => $val ) { if( $IsFirstIteration ) { $ArticleLinksWithAvailableSummary[$ArticleLinksWithAvailableSummaryNextIndex] = "*[$CurrentLink $CorrectArticleTitle] - " . $val; $IsFirstIteration = false; } //fwrite( $stderr, "$val\n" ); } $ArticleLinksWithAvailableSummaryConcept[$ArticleLinksWithAvailableSummaryNextIndex] = $CurrentTitle; $ArticleLinksWithAvailableSummaryNextIndex++; } if( $AddThisSummaryToTheArray ) { // ignore known invalid summaries if( strlen( $NextArticleSummary ) > 10 ) { $NextArticleSummary = trim( $NextArticleSummary, " ,-—'" ); if ( substr_count( $NextArticleSummary, "'' and ''[http://lesswrong.com/lw/hm/new_improved_lottery/ New Improved Lottery]''" ) > 0 || substr_count( $NextArticleSummary, "(and [http://lesswrong.com/lw/ht/beware_the_unsurprised/ Beware the Unsurprised])" ) > 0 || substr_count( $NextArticleSummary, "In particular, the [[Litany of Tarski]]." ) > 0 || substr_count( $NextArticleSummary, "(but first read: [http://lesswrong.com/lw/m2/the_litany_against_gurus/ The Litany Against Gurus])" ) > 0 || substr_count( $NextArticleSummary, ", [http://lesswrong.com/lw/" ) > 0 || substr_count( $NextArticleSummary, "and [http://lesswrong.com/lw/" ) > 0 || substr_count( $NextArticleSummary, "'' (prerequisite: [http://lesswrong.com/lw" ) > 0 || substr_count( $NextArticleSummary, "'' and ''[http://lesswrong.com/lw" ) > 0 || strcmp( $NextArticleSummary, "setting up the problem." 
) == 0 || strcmp( $NextArticleSummary, "[[Eliezer Yudkowsky]]" ) == 0 || strcmp( $NextArticleSummary, "by Salamon and Rayhawk." ) == 0 || strcmp( $NextArticleSummary, "by [[Eliezer Yudkowsky]]" ) == 0 || strcmp( $NextArticleSummary, "by talisman" ) == 0 || strcmp( $NextArticleSummary, "(short story)" ) == 0 || strcmp( $NextArticleSummary, "of a particular study design. Debiasing [http://lesswrong.com/lw/jk/burdensome_details/ won't be as simple] as practicing specific questions, it requires certain general habits of thought." ) == 0 || strcmp( $NextArticleSummary, "as practicing specific questions, it requires certain general habits of thought." ) == 0 || strcmp( $NextArticleSummary, "'' their single principle; but if they were ''really'' following ''only'' that single principle, they would [http://lesswrong.com/lw/kz/fake_optimization_criteria/ choose other acts to justify]." ) == 0 || strcmp( $NextArticleSummary, "all of their complicated ''other'' preferences into their choice of ''exactly'' which acts they try to ''[http://lesswrong.com/lw/kq/fake_justification/ justify using]'' their single principle; but if they were ''really'' following ''only'' that single principle, they would [http://lesswrong.com/lw/kz/fake_optimization_criteria/ choose other acts to justify]." ) == 0 || strcmp( $NextArticleSummary, "to this post tries to explain the cognitive twists whereby people [http://lesswrong.com/lw/ld/the_hidden_complexity_of_wishes/ smuggle] all of their complicated ''other'' preferences into their choice of ''exactly'' which acts they try to ''[http://lesswrong.com/lw/kq/fake_justification/ justify using]'' their single principle; but if they were ''really'' following ''only'' that single principle, they would [http://lesswrong.com/lw/kz/fake_optimization_criteria/ choose other acts to justify]." 
) == 0 || strcmp( $NextArticleSummary, "[http://lesswrong.com/lw/n1/allais_malaise/ followups]) - Offered choices between gambles, people make decision-theoretically inconsistent decisions." ) == 0 || strcmp( $NextArticleSummary, ") - Offered choices between gambles, people make decision-theoretically inconsistent decisions." ) == 0 || strcmp( $NextArticleSummary, "and ''[http://lesswrong.com/lw/oo/explaining_vs_explaining_away/ Explaining vs. Explaining Away]'' - elementary [[reductionism]]." ) == 0 || strcmp( $NextArticleSummary, "\" which essentially answered \"Not on the present state of the Art\"" ) == 0 || strcmp( $NextArticleSummary, "(and its [[Privileging the hypothesis | requisites]], like [[Locating the hypothesis]])" ) == 0 || strcmp( $NextArticleSummary, "and ''[http://lesswrong.com/lw/hm/new_improved_lottery/ New Improved Lottery]" ) == 0 || strcmp( $NextArticleSummary, "their single principle; but if they were ''really'' following ''only'' that single principle, they would [http://lesswrong.com/lw/kz/fake_optimization_criteria/ choose other acts to justify]." ) == 0 || strcmp( $NextArticleSummary, "[http://lesswrong.com/lw/w6/recursion_magic/ ...Recursion, Magic]" ) == 0 || strcmp( $NextArticleSummary, "[http://lesswrong.com/lw/wf/hard_takeoff/ Hard Takeoff]" ) == 0 || strcmp( $NextArticleSummary, "[http://lesswrong.com/lw/wg/permitted_possibilities_locality/ Permitted Possibilities, & Locality]" ) == 0 || strcmp( $NextArticleSummary, "(in the martial arts)" ) == 0 || strcmp( $NextArticleSummary, "(in both psychotherapy and martial arts)" ) == 0 || strcmp( $NextArticleSummary, "Description and account of the game." 
) == 0 || false ) { // don't add the invalid summary } else { $ArrayAllArticleUsedSummaries[$SearchResult][] = $NextArticleSummary; } } } } } // for authors who have their own wikipages, link to the wikipage instead of the LW user page $CorrectArticleAuthor = str_replace("[http://lesswrong.com/user/Eliezer_Yudkowsky Eliezer_Yudkowsky]", "[[Eliezer Yudkowsky]]", $CorrectArticleAuthor); // check if the link ends with a / if ( strcmp( $LastCharInLink, "/" ) != 0 && $SlashCountBeforeAdd < 6 ) { $ArticleLinksWithoutEndingSlash[$ArticleLinksWithoutEndingSlashNextIndex] = "*[$CurrentLink $CorrectArticleTitle] by $CorrectArticleAuthor"; $ArticleLinksWithoutEndingSlashConcept[$ArticleLinksWithoutEndingSlashNextIndex] = $CurrentTitle; $ArticleLinksWithoutEndingSlashNextIndex++; } if( strcmp($CurrentArticleTitle, $CorrectArticleTitle) != 0 && $LinkIsAfterHeader ) { // don't report an error for the following links: if ( substr_count( $CurrentArticleTitle, "ranges over anything, not just internal subjective experiences") <= 0 && substr_count( $CurrentArticleTitle, "sequence leading up") <= 0 && substr_count( $CurrentArticleTitle, "smuggle") <= 0 && substr_count( $CurrentArticleTitle, "justify using") <= 0 && substr_count( $CurrentArticleTitle, "choose other acts to justify") <= 0 && substr_count( $CurrentArticleTitle, "Timeless decision theory") <= 0 && substr_count( $CurrentArticleTitle, "philosophical majoritarianism") <= 0 && substr_count( $CurrentArticleTitle, "critical comments") <= 0 && substr_count( $CurrentArticleTitle, "Positive Bias") <= 0 && substr_count( $CurrentArticleTitle, "Hindsight Bias") <= 0 && substr_count( $CurrentArticleTitle, "not an isolated artifact") <= 0 && substr_count( $CurrentArticleTitle, "won't be as simple") <= 0 && substr_count( $CurrentArticleTitle, "Illusion of Transparency") <= 0 && substr_count( $CurrentArticleTitle, "Affect Heuristic") <= 0 && substr_count( $CurrentArticleTitle, "Evaluability") <= 0 && substr_count( $CurrentArticleTitle, 
"Unbounded Scales, Huge Jury Awards, and Futurism") <= 0 && substr_count( $CurrentArticleTitle, "subsequent") <= 0 && substr_count( $CurrentArticleTitle, "followups") <= 0 && substr_count( $CurrentArticleTitle, "Do We Believe <i>Everything</i> We're Told?") <= 0 && substr_count( $CurrentArticleTitle, "Quantum Physics") <= 0 && substr_count( $CurrentArticleTitle, "Shut Up and Do the Impossible") <= 0 && substr_count( $CurrentArticleTitle, "You ''Can'' Face Reality") <= 0 && substr_count( $CurrentArticleTitle, "Absence of Evidence ''Is'' Evidence of Absence") <= 0 && substr_count( $CurrentArticleTitle, "Doublethink: Choosing to be Biased") <= 0 && substr_count( $CurrentArticleTitle, "Anti-Epistemology") <= 0 && substr_count( $CurrentArticleTitle, "Is Humanism a Religion-Substitute?") <= 0 && substr_count( $CurrentArticleTitle, "Your Strength As A Rationalist") <= 0 && substr_count( $CurrentArticleTitle, "Absence of Evidence '''is''' Evidence of Absence") <= 0 && substr_count( $CurrentArticleTitle, "Reversed Stupidity is Not Intelligence") <= 0 && substr_count( $CurrentArticleTitle, "A Human's Guide to Words") <= 0 && substr_count( $CurrentArticleTitle, "here") <= 0 && true ) { $ArticleLinksWithWrongTitle[$ArticleLinksWithWrongTitleNextIndex] = "*[$CurrentLink $CorrectArticleTitle] by $CorrectArticleAuthor\n**(title was [$CurrentLink $CurrentArticleTitle])"; $ArticleLinksWithWrongTitleConcept[$ArticleLinksWithWrongTitleNextIndex] = $CurrentTitle; $ArticleLinksWithWrongTitleNextIndex++; if( $Debug ) { //fwrite( $stderr, "article title: $CurrentArticleTitle\ncorrect title: $CorrectArticleTitle\n\n" ); } } } //fwrite( $stderr, "article title: $CurrentArticleTitle\n\n" ); $ByCharacters = substr( $CurrentBody, $TitleEndPos, 6 ); $FirstHeaderPos = strpos($CurrentBody, "==", 0); // check if the link has an author // ignore missing authors if the link appears before any headers if( strcmp($ByCharacters, "] by [") != 0 && $LinkIsAfterHeader ) { $LineUpToAuthor = substr( 
$CurrentBody, $TokenStartPos-1, ($TitleEndPos+6)-($TokenStartPos-1) ); $LineUpToAuthor = str_replace("\n", "\\n", $LineUpToAuthor); $ArticleLinksWithoutAuthor[$ArticleLinksWithoutAuthorNextIndex] = "*[$CurrentLink $CorrectArticleTitle] by $CorrectArticleAuthor\n**(was $LineUpToAuthor)"; $ArticleLinksWithoutAuthorConcept[$ArticleLinksWithoutAuthorNextIndex] = $CurrentTitle; $ArticleLinksWithoutAuthorNextIndex++; if( $Debug ) { //fwrite( $stderr, "Concept: $CurrentTitle, by characters: <$ByCharacters>\n\n" ); } } //todo - change this to check if the author is correct! } //fwrite( $stderr, "link start: $TokenStartPos \n link end: $TokenEndPos \n current link: $CurrentLink\n\n" ); // find the current link in the array, if it exists $SearchResult = array_search( $CurrentLink, $ArrayAllArticleLink ); if( $SearchResult !== FALSE ) { // find if this concept exists in the array if( in_array( $CurrentTitle, $ArrayAllArticleIndexedConcepts[$SearchResult] ) ) { // mark the link as found $ArrayAllArticleFoundConcepts [$SearchResult][$CurrentTitle] = true; } else { // add the concept to the array of unindexed concepts $ArrayAllArticleNotIndexedConcepts[$SearchResult][] = $CurrentTitle; } } $TokenStartPos = strpos($CurrentBody, "http://lesswrong.com/lw/", $TokenEndPos); $TokenEndPos = strpos($CurrentBody, " ", $TokenStartPos); $CurrentLink = substr( $CurrentBody, $TokenStartPos, $TokenEndPos-$TokenStartPos ); if ( $TokenEndPos <= $TokenStartPos || $TokenStartPos === FALSE || substr_count( $CurrentLink, "http://lesswrong.com/lw/" ) <= 0 ) { $FinishedFindingLinks = true; } } } //$IsReadingConceptPages = false; //fwrite( $stderr, $CurrentBody ); //unfinished!!! //for each page: // first check if the page is in the $ConceptFound list // read the list of articles in the "Blog posts" section. 
// report any links that appear in the All Articles page, for that concept, but don't appear in the wiki page
// report any links that appear in the wiki page, but don't appear in the All Articles page, for that concept
$SuccessfullyReadOneConceptPage = true;
}
}
}
}

// Default handler for the XML parser.
// Registered through xml_set_default_handler() in new_xml_parser(); intentionally does nothing.
function defaultHandler($parser, $data)
{
}

// Opens $file for reading and creates an XML parser wired to this script's handlers
// ("startElement", "endElement", "characterData" and "defaultHandler").
//
// $file - path of the XML file to read
//
// Returns array($xml_parser, $fp) on success, or false when the file cannot be opened.
// Side effect: remembers $file in the global $parser_file array, keyed by an integer id of the parser.
function new_xml_parser($file)
{
    global $parser_file;

    // Open the file before creating the parser, so that a failed open does not leave an unused parser behind.
    // The warning is suppressed on purpose: the caller reports the failure itself.
    $fp = @fopen($file, "r");
    if( $fp === false )
    {
        return false;
    }

    $xml_parser = xml_parser_create();
    xml_parser_set_option($xml_parser, XML_OPTION_CASE_FOLDING, 1);
    xml_set_element_handler($xml_parser, "startElement", "endElement");
    xml_set_character_data_handler($xml_parser, "characterData");
    xml_set_default_handler($xml_parser, "defaultHandler");

    if( !is_array($parser_file) )
    {
        settype($parser_file, "array");
    }

    // The parser itself is not a valid array key: a resource only works through an implicit
    // (warning-raising) cast to its integer id, and an object is rejected outright.
    // Use an explicit integer id instead; for resources this is the same key as before.
    $parser_id = is_object($xml_parser) ? spl_object_id($xml_parser) : (int) $xml_parser;
    $parser_file[$parser_id] = $file;

    return array($xml_parser, $fp);
}

// Runs one complete pass over the XML dump named by the global $XMLfile,
// feeding it to a fresh parser from new_xml_parser() in 4096-byte chunks.
//
// Terminates the script with die() when the file cannot be opened or the XML is not well-formed.
// An empty file is not treated as an error: nothing is parsed and the function simply returns.
function ReadOnceThroughTheWholeXMLFile()
{
    global $XMLfile;

    // create the XML parser
    $opened = new_xml_parser($XMLfile);
    if( !is_array($opened) )
    {
        die("could not open XML input");
    }
    list($xml_parser, $fp) = $opened;

    // read the XML file
    $fed_any_data = false;
    $finalized = false;
    while( !$finalized )
    {
        $data = fread($fp, 4096);

        // compare strictly: a chunk consisting of just "0" is data, not the end of the input
        if( $data === false || $data === '' )
        {
            if( !$fed_any_data )
            {
                // nothing was read at all, so there is nothing to finalize
                break;
            }

            // the previous chunk ended exactly at the end of the file without feof() noticing;
            // the parser still needs a final call so that a truncated document is reported
            $data = '';
            $is_final = true;
        }
        else
        {
            $fed_any_data = true;
            $is_final = feof($fp);
        }

        if( !xml_parse($xml_parser, $data, $is_final) )
        {
            die( sprintf("XML error: %s at line %d\n",
                         xml_error_string(xml_get_error_code($xml_parser)),
                         xml_get_current_line_number($xml_parser)));
        }

        $finalized = $is_final;
    }

    // we're done with the XML file now, so close it
    fclose($fp);
}

fwrite( $stderr, "Reading through the All Articles pages\n\n" );

// first read through the All Articles pages
$IsReadingAllArticlesPages = true;
$NextYearToRead = $FirstYearToRead;
$FailedToReadAllArticlesPage = false;

// keep looping through the whole XML file
// abort when we've successfully read all of the All Articles pages,
// or if we've looped through the whole file without finding the next one
// there's probably a more efficient way than looping repeatedly through the whole XML file,
// but that would probably make the code even more tangled than it is now.
// in principle one pass would be enough, because the All Articles pages should be in order in the XML file,
// but the repeated passes are left as they are for now.
// each pass below is one full read of the XML file; the pass counts as failed when
// $SuccessfullyReadOneAllArticlesPage is still false afterwards.
while ( ! $FailedToReadAllArticlesPage && $NextYearToRead <= $FinalYearToRead )
{
    fwrite( $stderr, "processing the XML file, year $NextYearToRead \n\n" );

    $SuccessfullyReadOneAllArticlesPage = false;
    ReadOnceThroughTheWholeXMLFile();

    if( $SuccessfullyReadOneAllArticlesPage )
    {
        continue;
    }

    $FailedToReadAllArticlesPage = true;
    fwrite( $stderr, "error: failed to read the next All Articles page, year $NextYearToRead \n\n" );
}
$IsReadingAllArticlesPages = false;


fwrite( $stderr, "Reading through the Summaries pages\n\n" );

// second stage: the Summaries pages, starting again from the first year
$IsReadingSummariesPages = true;
$NextYearToRead = $FirstYearToRead;
$FailedToReadSummariesPage = false;

// same scheme as for the All Articles pages:
// keep making full passes over the XML file until every Summaries page has been read,
// or until a pass finishes without finding the next one.
// a more efficient approach than repeated full passes probably exists,
// but it would probably make the code even more tangled than it is now.
// the repetition should be unnecessary anyway, because the Summaries pages should be in order in the XML file.
while ( ! $FailedToReadSummariesPage && $NextYearToRead <= $FinalYearToRead )
{
    fwrite( $stderr, "processing the XML file for summaries, year $NextYearToRead \n\n" );

    $SuccessfullyReadOneSummariesPage = false;
    ReadOnceThroughTheWholeXMLFile();

    if( $SuccessfullyReadOneSummariesPage )
    {
        continue;
    }

    $FailedToReadSummariesPage = true;
    fwrite( $stderr, "error: failed to read the next summaries page, year $NextYearToRead \n\n" );
}
$IsReadingSummariesPages = false;


fwrite( $stderr, "marking all concepts as not found yet\n\n" );

// every concept listed for any article starts out as "not found"
foreach( $ArrayAllArticleIndexedConcepts as $key => $val )
{
    foreach( $val as $key2 => $val2 )
    {
        $ConceptFound[$val2] = false;
    }
}


fwrite( $stderr, "Reading through the Concept pages\n\n" );

// third stage: the Concept pages
$IsReadingConceptPages = true;
$FailedToReadConceptPage = false;

// a single pass over the XML file is enough here,
// because the concept pages don't need to be read in any particular order
ReadOnceThroughTheWholeXMLFile();

fwrite( $stderr, "Pages read: $PagesRead\n" );

if( ! $SuccessfullyReadOneConceptPage )
{
    $FailedToReadConceptPage = true;
    fwrite( $stderr, "error: failed to read any Concept page \n\n" );
}


fwrite( $stderr, "Doing final processing, step 1 of 3...\n\n" );

// now go through the $ConceptFound array
// for any concept that doesn't already have a wikipage, output a template for a blank page,
// with the "Blog posts" section filled in with all of the blog posts that mention this topic
// NOTE(review): the statements that follow only write report lists; no template output is visible here.

if( false )
{
    // this section was added just to clean up the pages that use the old OB link for Eliezer's post instead of the new LW link
    // disabled now because it gives too many false positives
    fwrite( $fp, "\n\n==The following concept pages link to OvercomingBias.com articles:==\n\n" );
    foreach( $PagesWithOvercomingBiasLinks as $key => $val )
    {
        fwrite( $fp, "*[[$key]]\n" );
    }
}

// one bullet per page title recorded in $PagesWithComments
fwrite( $fp, "\n\n==The following concept pages have comments:==\n\n" );
foreach( $PagesWithComments as $key => $val )
{
    fwrite( $fp, "*[[$key]]\n" );
}

// this section was added just to clean up the pages that use the old OB link for Eliezer's post instead of the new LW link
// disabled now because it gives too many false positives
// NOTE(review): the section that follows this remark is not wrapped in if( false ); confirm whether the remark is stale.
// ---- simple per-page lists: one bullet per page title recorded in the corresponding array ----

fwrite( $fp, "\n\n==The following concept pages have the \"Overcoming Bias Articles\" header:==\n\n" );
foreach( $PagesWithOvercomingBiasArticlesHeader as $key => $val )
{
    fwrite( $fp, "*[[$key]]\n" );
}

fwrite( $fp, "\n\n==The following concept pages have \"External references\" instead of \"References\":==\n\n" );
foreach( $PagesWithExternalReferences as $key => $val )
{
    fwrite( $fp, "*[[$key]]\n" );
}

fwrite( $fp, "\n\n==The following concept pages have a miscapitalized \"See Also\" header:==\n\n" );
foreach( $PagesWithSeeAlso as $key => $val )
{
    fwrite( $fp, "*[[$key]]\n" );
}

fwrite( $fp, "\n\n==The following concept pages have an author link that links to an external site:==\n\n" );
foreach( $PagesWithExternalAuthorLinks as $key => $val )
{
    fwrite( $fp, "*[[$key]]\n" );
}

fwrite( $fp, "\n\n==The following concept pages have an extra newline after the wikilink template:==\n\n" );
foreach( $PagesWithNewlineAfterWikiLink as $key => $val )
{
    fwrite( $fp, "*[[$key]]\n" );
}

fwrite( $fp, "\n\n==The following concept pages have the See Also section before the Blog Posts section:==\n\n" );
foreach( $PagesWithSeeAlsoBeforeBlogPosts as $key => $val )
{
    fwrite( $fp, "*[[$key]]\n" );
}

// disabled because of a weird bug that was introduced when I added the "see also" checking
/*
fwrite( $fp, "\n\n==The following article links are missing the / at the end, or aren't in the index:==\n\n" );
$PreviousConcept = "";
foreach( $ArticleLinksWithoutEndingSlash as $key => $val )
{
    if( strcmp( $PreviousConcept, $ArticleLinksWithoutEndingSlashConcept[$key] ) != 0 )
    {
        fwrite( $fp, "\n*[[$ArticleLinksWithoutEndingSlashConcept[$key]]]\n" );
    }
    fwrite( $fp, "*$val\n" );
    $PreviousConcept = $ArticleLinksWithoutEndingSlashConcept[$key];
}
*/

// ---- per-link lists, grouped by concept: a concept heading is written whenever the concept changes ----

fwrite( $fp, "\n\n==The following article links have a wrong or improperly formatted title:==\n\n" );
$PreviousConcept = "";
foreach( $ArticleLinksWithWrongTitle as $key => $val )
{
    if( strcmp( $PreviousConcept, $ArticleLinksWithWrongTitleConcept[$key] ) != 0 )
    {
        fwrite( $fp, "\n*[[$ArticleLinksWithWrongTitleConcept[$key]]]\n" );
    }
    fwrite( $fp, "*$val\n" );
    $PreviousConcept = $ArticleLinksWithWrongTitleConcept[$key];
}

fwrite( $fp, "\n\n==The following article links have a summary available that was not added to the page:==\n\n" );
$PreviousConcept = "";
foreach( $ArticleLinksWithAvailableSummary as $key => $val )
{
    if( strcmp( $PreviousConcept, $ArticleLinksWithAvailableSummaryConcept[$key] ) != 0 )
    {
        fwrite( $fp, "\n*[[$ArticleLinksWithAvailableSummaryConcept[$key]]]\n" );
    }
    fwrite( $fp, "*$val\n" );
    $PreviousConcept = $ArticleLinksWithAvailableSummaryConcept[$key];
}

if( false )
{
    /// this section was disabled because there were way too many false positives
    fwrite( $fp, "\n\n==The following article links have a missing or improperly formatted author:==\n\n" );
    $PreviousConcept = "";
    foreach( $ArticleLinksWithoutAuthor as $key => $val )
    {
        if( strcmp( $PreviousConcept, $ArticleLinksWithoutAuthorConcept[$key] ) != 0 )
        {
            fwrite( $fp, "\n*[[$ArticleLinksWithoutAuthorConcept[$key]]]\n" );
        }
        fwrite( $fp, "*$val\n" );
        $PreviousConcept = $ArticleLinksWithoutAuthorConcept[$key];
    }
}

// ---- concept-level lists ----

// concepts whose flag in $ConceptFound is still false
fwrite( $fp, "\n\n==The following concepts don't have wikipages with links to LessWrong.com articles yet:==\n\n" );
foreach( $ConceptFound as $key => $val )
{
    if( ! $val )
    {
        fwrite( $fp, "*[[$key]]\n" );
    }
}

fwrite( $fp, "\n\n==The following concepts are not in the All Articles pages:==\n\n" );
foreach( $ConceptNotInIndex as $key => $val )
{
    fwrite( $fp, "*[[$key]]\n" );
}

fwrite( $fp, "\n\n==The following concepts are in the All Articles page, but are redirects:==\n\n" );
foreach( $ConceptThatAreRedirects as $key => $val )
{
    fwrite( $fp, "*[[$key]]\n" );
}


fwrite( $stderr, "Doing final processing, step 2 of 3...\n\n" );

// now output the list of which articles in the All Articles index are missing an entry:
fwrite( $fp, "\n\n==The following articles in the [[Less Wrong/All Articles|All Articles]] index are missing an entry:==\n" );
foreach( $ArrayAllArticleNotIndexedConcepts as $key => $val )
{
    if( count( $val ) > 0 )
    {
        fwrite( $fp, "\n*[$ArrayAllArticleLink[$key] $ArrayAllArticleTitle[$key]] is missing the following concepts:\n" );
        foreach( $val as $key2 => $val2 )
        {
            fwrite( $fp, "**[[$val2]]\n" );
        }
    }
}


fwrite( $stderr, "Doing final processing, step 3 of 3...\n\n" );

// now output the list of which article links need to be added to the concept pages:
fwrite( $fp, "\n\n==The following article links need to be added to the concept pages:==\n" );

// for each concept
foreach( $ConceptFound as $key => $val )
{
    // make an array to store the missing links
    $MissingLinks = array();

    // for each article
    foreach( $ArrayAllArticleIndexedConcepts as $key2 => $val2 )
    {
        // check if the article contains the concept
        // array_search() returns the position of the match, and position 0 is falsy,
        // so the result has to be compared against false explicitly
        if( array_search($key, $val2) !== false )
        {
            //fwrite( $stderr, "concept: $key \n link: $ArrayAllArticleLink[$key2] \n title: $ArrayAllArticleTitle[$key2]\n\n" );

            // check if the concept's link was not found
            // (an article without any entry in the found-concepts array counts as "not found")
            if( ! isset($ArrayAllArticleFoundConcepts[$key2])
             || ! array_key_exists($key, $ArrayAllArticleFoundConcepts[$key2]) )
            {
                // add it to the list of concepts whose links were not found
                $MissingLinks[] = "**[$ArrayAllArticleLink[$key2] $ArrayAllArticleTitle[$key2]] by $ArrayAllArticleAuthor[$key2]";
                //fwrite( $stderr, "concept: $key \n link: $ArrayAllArticleLink[$key2] \n title: $ArrayAllArticleTitle[$key2]\n\n" );
            }
        }
    }

    // if there are any not found article links, then output the list
    if( count($MissingLinks) > 0 )
    {
        fwrite( $fp, "\n*[[$key]] is missing the following article links:\n" );
        foreach( $MissingLinks as $key3 => $val3 )
        {
            fwrite( $fp, "$val3\n" );
        }
    }
}

// now output the See Also links that are not reciprocated:
// a link from concept A to concept B is reported unless B's See Also list also contains A
fwrite( $fp, "\n\n==The following See Also links only go one way:==\n" );

// for each concept, and each of its See Also targets
foreach( $ConceptSeeAlso as $key => $val )
{
    foreach( $val as $key2 => $val2 )
    {
        $Concept1 = $key;
        $Concept2 = $val2;
        $MatchFound = false;

        // look for the reverse link
        foreach( $ConceptSeeAlso as $key3 => $val3 )
        {
            foreach( $val3 as $key4 => $val4 )
            {
                $Concept3 = $key3;
                $Concept4 = $val4;
                if( $Concept1 == $Concept4 && $Concept2 == $Concept3 )
                {
                    $MatchFound = true;
                    // one reverse link is enough, so stop scanning
                    break 2;
                }
            }
        }

        if( !$MatchFound )
        {
            fwrite( $fp, "\n*[[$Concept1]] -> [[$Concept2]]" );
        }
    }
}

// alphabetical list of the concepts that were marked as found
fwrite( $fp, "\n\n==The following is a list of all concept pages:==\n\n" );
$SortedConcepts = $ConceptFound;
ksort($SortedConcepts);
foreach( $SortedConcepts as $key => $val )
{
    if( $val )
    {
        fwrite( $fp, "*[[$key]]\n" );
    }
}

fwrite( $fp, "\n\n==Links to the All Articles pages:==\n" );
fwrite( $fp, "*[[Less Wrong/All Articles]]\n" );
fwrite( $fp, "*[[Less Wrong/2006 Articles]]\n" );
fwrite( $fp, "*[[Less Wrong/2007 Articles]]\n" );
fwrite( $fp, "*[[Less Wrong/2008 Articles]]\n" );
fwrite( $fp, "*[[Less Wrong/2009 Articles]]\n" );
fwrite( $fp, "*[[Less Wrong/2010 Articles]]\n" );

fwrite( $fp, "\n\n==Links to the Summaries pages:==\n" );
fwrite( $fp, "*[[Less Wrong/Article summaries]]\n" );
fwrite( $fp, "*[[Less Wrong/2006 Articles/Summaries]]\n" );
fwrite( $fp, "*[[Less Wrong/2007 Articles/Summaries]]\n" );
fwrite( $fp, "*[[Less Wrong/2008 Articles/Summaries]]\n" );
fwrite( $fp, "*[[Less Wrong/2009 Articles/Summaries]]\n" );
fwrite( $fp, "*[[Less Wrong/2010 Articles/Summaries]]\n" );

fclose($fp);


// ---- ConceptGraph.dot.txt: the See Also links as a Graphviz digraph ----

$fp = fopen('ConceptGraph.dot.txt', 'w');

//{_COPYBLOCK1

fwrite($fp, "digraph G {\n\nnode [fontsize=\"$fontsize\"]\n\n");

// output the data for each concept
// (one node per key of $ConceptSeeAlso; the node id is the title with spaces replaced by underscores)
foreach( $ConceptSeeAlso as $key => $val )
{
    $curLine = "";

    $NameWithSpaces = $key;
    $NameWithoutSpaces = str_replace(" ", "_", $NameWithSpaces);

    $curName = "\"" . $NameWithoutSpaces . "\"";
    $curName = str_pad( $curName, $PaddingValue1 );
    $curLine .= $curName;

    $curLine .= "[label=\"";
    $curName = $NameWithSpaces;
    $curName .= "\"";
    $curName = str_pad( $curName, $PaddingValue2 );
    $curLine .= $curName;

    // $curLine .= ", color=";
    //
    // $curLine .= $NewArrayScenarioOutlineColors[$key];
    //
    // $curLine .= ", shape=";
    //
    // $curLine .= $NewArrayScenarioShapes[$key];
    //
    // $curLine .= ", style=";
    //
    // $curLine .= $NewArrayScenarioStyles[$key];
    //
    // $curLine .= ", fillcolor=\"#";
    //
    // $curLine .= $NewArrayScenarioFillColors[$key];
    //
    // $curLine .= "\"";

    //if( strcmp($NewArrayScenarioNamesWithoutSpaces[$key], $ArrayScenarioNamesWithoutSpaces[$basekey]) == 0 )
    //{
    //    $curLine .= ", peripheries=3";
    //}

    $newURL = "http://wiki.lesswrong.com/wiki/" . str_replace(" ", "_", $NameWithSpaces);
    $curLine .= ", URL=\"$newURL\"";

    $curLine .= "];\n";

    fwrite($fp, $curLine);
}

fwrite($fp, "\n\n");

// output the data for the edges
// (one edge per See Also link, from the concept to the linked concept)
foreach( $ConceptSeeAlso as $key => $val )
{
    foreach( $val as $key2 => $val2 )
    {
        $curLine = "";

        $Name1WithSpaces = $key;
        $Name1WithoutSpaces = str_replace(" ", "_", $Name1WithSpaces);

        $Name2WithSpaces = $val2;
        $Name2WithoutSpaces = str_replace(" ", "_", $Name2WithSpaces);

        $curName = "\"" . $Name1WithoutSpaces . "\"";
        $curName = str_pad( $curName, $PaddingValue1 );
        $curLine .= $curName;

        $curLine .= "-> ";
        $curLine .= "\"" . $Name2WithoutSpaces . "\"";
        $curLine .= "\n";

        fwrite($fp, $curLine);
    }
}

fwrite($fp, "\n\n}");

//}_COPYBLOCK1

fclose($fp);


// ---- AllArticles.txt: one wiki table row per article (link, concepts, author) ----

$fp = fopen('AllArticles.txt', 'w');

fwrite( $stderr, "Outputting new All Articles page...\n\n" );
$ProgressCounter = 0;

foreach( $ArrayAllArticleTitle as $key => $val )
{
    // comma-separated list of the article's concepts, indexed ones first, without duplicates
    $ConceptString = "";
    $TempConceptArray = array();

    // the isset() guards cover articles for which no entry was created in the respective array
    if( isset($ArrayAllArticleIndexedConcepts[$key]) )
    {
        foreach( $ArrayAllArticleIndexedConcepts[$key] as $key2 => $val2 )
        {
            if( ! in_array($val2, $TempConceptArray) )
            {
                $ConceptString .= "[[$val2]], ";
                $TempConceptArray[] = $val2;
            }
        }
    }

    if( isset($ArrayAllArticleNotIndexedConcepts[$key]) )
    {
        foreach( $ArrayAllArticleNotIndexedConcepts[$key] as $key2 => $val2 )
        {
            if( ! in_array($val2, $TempConceptArray) )
            {
                $ConceptString .= "[[$val2]], ";
                $TempConceptArray[] = $val2;
            }
        }
    }

    // drop the trailing ", "
    if( strlen($ConceptString) > 2 )
    {
        $ConceptString = substr($ConceptString, 0, -2);
    }

    fwrite( $fp, "|-valign=\"top\"\n" );
    fwrite( $fp, "| [$ArrayAllArticleLink[$key] $ArrayAllArticleTitle[$key]]\n" );
    fwrite( $fp, "| $ConceptString\n" );
    fwrite( $fp, "| $ArrayAllArticleAuthor[$key]\n" );

    if( $ProgressCounter % 100 == 0 )
    {
        //fwrite( $stderr, "Progress: $ProgressCounter\n" );
    }
    $ProgressCounter++;
}
fwrite( $stderr, "Progress: $ProgressCounter\n" );

fclose($fp);


// ---- ArticleSummaries.txt: the summaries collected in $ArrayAllArticleUsedSummaries ----

$fp = fopen('ArticleSummaries.txt', 'w');

fwrite( $stderr, "Outputting article summaries...\n\n" );
$ProgressCounter = 0;

foreach( $ArrayAllArticleTitle as $key => $val )
{
    // the article heading is written only together with its first summary,
    // so articles without summaries produce no output here
    $IsFirstIteration = true;

    // the isset() guard covers articles for which no summary entry was created
    if( isset($ArrayAllArticleUsedSummaries[$key]) )
    {
        foreach( $ArrayAllArticleUsedSummaries[$key] as $key2 => $val2 )
        {
            if( !$IsFirstIteration )
            {
                fwrite( $fp, "\n\n(alternate summary:)\n\n" );
            }
            else
            {
                fwrite( $fp, "\n\n=====[$ArrayAllArticleLink[$key] $ArrayAllArticleTitle[$key]]=====\n\n" );
                $IsFirstIteration = false;
            }

            fwrite( $fp, $val2 );
        }
    }

    if( $ProgressCounter % 100 == 0 )
    {
        //fwrite( $stderr, "Progress: $ProgressCounter\n" );
    }
    $ProgressCounter++;
}
fwrite( $stderr, "Progress: $ProgressCounter\n" );

fclose($fp);


// ---- ArticleSummaries2.txt ----

$fp = fopen('ArticleSummaries2.txt', 'w');

fwrite( $stderr, "Outputting article summaries...\n\n" );
$ProgressCounter = 0;
// one section per article: the heading is always written, followed by the official summaries
// (looked up by article title) and then the used summaries (looked up by article index)
foreach( $ArrayAllArticleTitle as $key => $val )
{
    fwrite( $fp, "\n\n=====[$ArrayAllArticleLink[$key] $ArrayAllArticleTitle[$key]]=====\n\n" );

    // every summary after the first one of an article is introduced by an "alternate summary" separator
    $IsFirstIteration = true;

    if( array_key_exists( $ArrayAllArticleTitle[$key], $ArrayAllArticleOfficialSummaries ) )
    {
        foreach( $ArrayAllArticleOfficialSummaries[$ArrayAllArticleTitle[$key]] as $key2 => $val2 )
        {
            if( !$IsFirstIteration )
            {
                fwrite( $fp, "\n\n(alternate summary:)\n\n" );
            }
            else
            {
                $IsFirstIteration = false;
            }

            fwrite( $fp, $val2 );
        }
    }

    // the isset() guard covers articles for which no summary entry was created
    if( isset($ArrayAllArticleUsedSummaries[$key]) )
    {
        foreach( $ArrayAllArticleUsedSummaries[$key] as $key2 => $val2 )
        {
            if( !$IsFirstIteration )
            {
                fwrite( $fp, "\n\n(alternate summary:)\n\n" );
            }
            else
            {
                $IsFirstIteration = false;
            }

            fwrite( $fp, $val2 );
        }
    }

    if( $ProgressCounter % 100 == 0 )
    {
        //fwrite( $stderr, "Progress: $ProgressCounter\n" );
    }
    $ProgressCounter++;
}
fwrite( $stderr, "Progress: $ProgressCounter\n" );

fclose($fp);

// the next line closes the comment toggle opened by the "//*" line near the top of the file
//*/


// ---- RecentPosts.txt: table rows for the posts listed on the recent posts page, oldest first ----

fwrite( $stderr, "Processing Recent Post List...\n\n" );

$handle = fopen('http://lesswrong.com/recentposts', 'r');
if( $handle === false )
{
    // without this check the read loop below would be handed false instead of a stream
    fwrite( $stderr, "error: could not open the recent posts page\n\n" );
    return "error reading file";
}

$RecentPostRawData = "";
while( ( $buf = fread( $handle, 8192 ) ) != '' )
{
    $RecentPostRawData .= $buf;
}
if( $buf === FALSE )
{
    // report the failure instead of ending the script silently
    fwrite( $stderr, "error: could not read the recent posts page\n\n" );
    fclose($handle);
    return "error reading file";
}
fclose($handle);

$fp = fopen('RecentPosts.txt', 'w');

// read the data from the recent posts page, then output the data in reverse order
$ArrayRecentPostLines = array();

// scanning starts at the first <h3> and stops once it has moved past the "View more:" paragraph
$PostLineStartPos = strpos( $RecentPostRawData, "<h3>", 0);
$PostLineEndPos = $PostLineStartPos;
$EndPos = strpos( $RecentPostRawData, "<p class=\"nextprev\">View more:", 0);
$NextArticleTitle = "";
$IterationCount = 0;
$PastTheEnd = false;

// NOTE(review): the in_array() condition is assumed to be the only commented-out part of this
// loop condition; the original line breaks are not preserved in the source.
while
(
//  ! in_array($NextArticleTitle, $ArrayAllArticleTitle) &&
    $IterationCount < 1000 &&
    ! $PastTheEnd &&
    true
)
{
    // first anchor of the entry: the article link and title
    $AnchorPos = strpos( $RecentPostRawData, "<a href=\"", $PostLineEndPos );
    if( $AnchorPos === false )
    {
        // no further links: stop here, otherwise false + 9 would restart the scan at offset 9
        $PastTheEnd = true;
        continue;
    }
    $PostLineStartPos = $AnchorPos + 9;
    $PostLineEndPos = strpos( $RecentPostRawData, "\"", $PostLineStartPos );
    $NextArticleLink = substr( $RecentPostRawData, $PostLineStartPos, $PostLineEndPos-$PostLineStartPos );

    $PostLineStartPos = strpos( $RecentPostRawData, ">", $PostLineEndPos ) + 1;
    $PostLineEndPos = strpos( $RecentPostRawData, "<", $PostLineStartPos );
    $NextArticleTitle = substr( $RecentPostRawData, $PostLineStartPos, $PostLineEndPos-$PostLineStartPos );
    // the page delivers double quotes in titles as HTML entities
    $NextArticleTitle = str_replace("&quot;", "\"", $NextArticleTitle);

    // second anchor of the entry: the author link and name
    $AnchorPos = strpos( $RecentPostRawData, "<a href=\"", $PostLineEndPos );
    if( $AnchorPos === false )
    {
        $PastTheEnd = true;
        continue;
    }
    $PostLineStartPos = $AnchorPos + 9;
    $PostLineEndPos = strpos( $RecentPostRawData, "\"", $PostLineStartPos );
    $NextArticleAuthorLink = substr( $RecentPostRawData, $PostLineStartPos, $PostLineEndPos-$PostLineStartPos );

    $PostLineStartPos = strpos( $RecentPostRawData, ">", $PostLineEndPos ) + 1;
    $PostLineEndPos = strpos( $RecentPostRawData, "<", $PostLineStartPos );
    $NextArticleAuthorName = substr( $RecentPostRawData, $PostLineStartPos, $PostLineEndPos-$PostLineStartPos );

    if( $PostLineStartPos > $EndPos )
    {
        $PastTheEnd = true;
    }
    else
    {
        // one wiki table row: article link, empty concepts column, author link
        $CurrentLineString = "";
        $CurrentLineString .= "|-valign=\"top\"\n";
        $CurrentLineString .= "| [http://lesswrong.com$NextArticleLink $NextArticleTitle]\n";
        $CurrentLineString .= "| \n";
        $CurrentLineString .= "| [$NextArticleAuthorLink $NextArticleAuthorName]\n";

        //fwrite( $fp, $CurrentLineString );

        $ArrayRecentPostLines[$IterationCount] = $CurrentLineString;
        $IterationCount++;
    }
}

//fwrite( $fp, "\n\n\n\n\n\n\n\n----------------------------------------\n\n\n\n\n\n\n\n" );

// write the collected rows in reverse order of collection
for( $index = $IterationCount-1; $index >= 0; $index-- )
//foreach( $ArrayRecentPostLines as $key => $val )
{
    fwrite( $fp, $ArrayRecentPostLines[$index] );
}

fwrite( $stderr, "Done" );

fclose($fp);


// ---- Debug.txt: dump of the main data arrays, only when $Debug is set ----

if( $Debug )
{
    if( true )
    {
        $fp = fopen('Debug.txt', 'w');

        fwrite( $fp, "\n\n\n\n\n\n\n\n\n\n ConceptFound \n\n\n\n\n\n\n\n\n\n" );
        foreach( $ConceptFound as $key => $val )
        {
            $data = $key . "\n" . print_r($val, true) . "\n";
            fwrite( $fp, $data );
        }

        fwrite( $fp, "\n\n\n\n\n\n\n\n\n\n ArrayAllArticleIndexedConcepts \n\n\n\n\n\n\n\n\n\n" );
        foreach( $ArrayAllArticleIndexedConcepts as $key => $val )
        {
            $data = $key . "\n" . print_r($val, true);
            fwrite( $fp, $data );
        }

        fwrite( $fp, "\n\n\n\n\n\n\n\n\n\n ArrayAllArticleFoundConcepts \n\n\n\n\n\n\n\n\n\n" );
        foreach( $ArrayAllArticleFoundConcepts as $key => $val )
        {
            $data = $key . "\n" . print_r($val, true);
            fwrite( $fp, $data );
        }

        fwrite( $fp, "\n\n\n\n\n\n\n\n\n\n ArrayAllArticleNotIndexedConcepts \n\n\n\n\n\n\n\n\n\n" );
        foreach( $ArrayAllArticleNotIndexedConcepts as $key => $val )
        {
            $data = $key . "\n" . print_r($val, true);
            fwrite( $fp, $data );
        }

        fwrite( $fp, "\n\n\n\n\n\n\n\n\n\n ConceptSeeAlso \n\n\n\n\n\n\n\n\n\n" );
        foreach( $ConceptSeeAlso as $key => $val )
        {
            $data = $key . "\n" . print_r($val, true);
            fwrite( $fp, $data );
        }

        fwrite( $fp, "\n\n\n\n\n\n\n\n\n\n ArrayAllArticleOfficialSummaries \n\n\n\n\n\n\n\n\n\n" );
        foreach( $ArrayAllArticleOfficialSummaries as $key => $val )
        {
            $data = $key . "\n" . print_r($val, true);
            fwrite( $fp, $data );
        }

        // this array is indexed by article number, so the title is looked up for the dump
        fwrite( $fp, "\n\n\n\n\n\n\n\n\n\n ArrayAllArticleUsedSummaries \n\n\n\n\n\n\n\n\n\n" );
        foreach( $ArrayAllArticleUsedSummaries as $key => $val )
        {
            $data = $ArrayAllArticleTitle[$key] . "\n" . print_r($val, true);
            fwrite( $fp, $data );
        }

        fclose($fp);
    }
}

fclose($stderr);

//
?>