1

Further to this question: PDF::FromHTML - Corrupt file and no output

The code in question is 'working' in that it produces a PDF document just fine, just NONE of the HTML anchors are being translated, and on larger documents the processing ceases at Page 11 of the PDF - with no error, it closes the document just fine!

Edit: To save looking at the Question Link:

    # print "<p>".$textblob."</p>";

    # Wrap the accumulated HTML fragment in a minimal document skeleton
    # before it is handed to the converter.
    $textblob='<html><head></head><body>'.$textblob.'</body></html>';

    # $textblob = decode('UTF-8', $textblob);

    my $output;
    # Only build the PDF when the 'PDF' request parameter is present.
    if(defined($query->param('PDF'))){
        my $pdf = PDF::FromHTML->new( encoding => 'utf-8' );
        # The HTML is passed as a scalar reference rather than a file name.
        $pdf->load_file(\$textblob);
        $pdf->convert(
            # With PDF::API2, font names such as 'traditional' also works
             Font        => 'Arial',
             LineHeight  => 10,
             Landscape   => 0,
        );
        # The PDF is written into $output (again via a scalar reference)
        # and then printed to the client.
        $pdf->write_file(\$output);
        print $output;
    }

$textblob when uncommented to print and commenting out the PDF section displays the full 400 reference adventure with links in html just fine...

Update: In desperation, here is the entire script (it's not TOO long...):

#!/usr/bin/perl
use cPanelUserConfig;
use CGI::Carp qw(fatalsToBrowser);
use CGI;
use List::Util qw(shuffle);
use PDF::FromHTML;
require "authenticate.pl";

# ---------------------------------------------------------------------------
# Main request dispatcher.
#
# Emits the HTTP header (PDF or HTML), then chooses one of three paths:
#   * 'doc' parameter present        - split the pasted document into numbered
#                                      references and show the editing form;
#   * 'references' parameter present - rebuild the reference hash from the
#                                      form and run the requested action
#                                      (Randomise / Save / Auto-HTML Tag /
#                                      Auto-ABML Tag / PDF);
#   * otherwise                      - show the initial paste-your-text form.
#
# $query is deliberately a package global: every sub in this file reads it.
# ---------------------------------------------------------------------------
$query = CGI->new;

# Evaluated once; decides both the content type and whether the HTML
# header/footer may be emitted around the body.
my $pdf_requested = defined($query->param('PDF'));

if($pdf_requested){
    print $query->header(-type=>'application/pdf');
}
else{
    print $query->header(-charset=>'utf-8');
    html_header();
    print "\n\n\n\n<!-- -------------------------- BEGIN: ff.net Script generated text ------------------------------------------- -->";
    print "Randomise working? Let me know if you find a bug.<br />";
}

if(defined($query->param('doc'))){
    # The "\nEOF" sentinel gives the look-ahead in the regex below something
    # to stop on for the final reference.
    my $doc=$query->param('doc')."\nEOF";
    my %refhash = $doc =~ /^[\n\s\t\.\#]*(\d+)[\s\t\.\#\n]+(?!\n*^[\n\s\t\.\#]*\d+[\s\t\.\#\n]+)(.+?)(?=^[\s\t\.\#\n]*\d+[\s\t\.\#\n]+|EOF)/smcgi; # refhash{key}=content, where key==refnumber and content==well, ref content
    display_refhash(\%refhash);

}

elsif(defined($query->param('references'))){
    my %anchors;
    # scalar() forces a single value: param() in list context inside an
    # argument list could otherwise inject extra arguments.
    my $refhashref=recreate_refhash(scalar $query->param('references'),\%anchors);

    if(defined($query->param('Randomise'))){
        $refhashref=randomise($refhashref,\%anchors);
        print "Your adventure looks like this: <br /><br />";
        display_refhash($refhashref);
    }
    elsif(defined($query->param('Save'))){
        save($refhashref);
    }
    elsif(defined($query->param('Auto-HTML Tag'))){
        print "Your adventure looks like this: <br /><br />";
        display_refhash($refhashref);
    }
    elsif(defined($query->param('Auto-ABML Tag'))){
        autoABML($refhashref);
        print "Your adventure looks like this: <br /><br />";
        display_refhash($refhashref);
    }
    elsif($pdf_requested){
        output_pdf($refhashref);
    }
    else{
        print "undefined function call";
    }
}
else{ # output form to input doc content
    print "Please input your document text into the textarea below (copy and paste should do it):";
    print '<form method="post" action="doc_to_refs.cgi" enctype="multipart/form-data" name="doc_to_refs_form">';
    print $query->textarea(-name=>'doc',-rows=>20,-cols=>100, -style=>"font-family:arial;width:98%");
    print $query->submit('Go!');
    print '</form>';
}

# Nothing may follow the binary PDF body, so the HTML footer is only
# emitted for HTML responses.
html_footer() unless $pdf_requested;

# print "<!-- -------------------------- END: ff.net Script generated text ------------------------------------------- -->";    

# Rebuild the reference hash from the numbered fields of the submitted form.
#
# Arguments:
#   $references  - number of form rows to inspect (rows 0 .. $references-1)
#   $anchors_ref - hash ref that is filled in as a side effect: for every
#                  kept row whose "anchorN" field is set, the field's value
#                  is mapped to the row index N
#
# Rows whose "deleteN" field is set are skipped entirely.
# Returns a hash ref mapping each row's "referenceN" value to its
# "reftextN" value. Reads the form through the file-level $query object.
sub recreate_refhash{
    my ($references,$anchors_ref)=@_;
    my %rebuilt;

    for(my $row=0;$row<$references;$row++){
        # A ticked delete box drops the whole row.
        next if defined($query->param("delete$row"));

        my $number=$query->param("reference$row");
        my $content=$query->param("reftext$row");
        $rebuilt{$number}=$content;

        my $anchored=$query->param("anchor$row");
        if(defined($anchored)){
            $anchors_ref->{$anchored}=$row;
        }
    }

    return \%rebuilt;
}

# Shuffle the reference contents among the existing reference numbers,
# keeping anchored references where they are, and renumber the in-text
# links ("turn to 12", "go to page 3", ...) to follow the moved content.
#
# Arguments:
#   $refhashref - hash ref: reference number => reference content
#   $anchor_ref - hash ref whose keys are the reference numbers that must
#                 not move (the values are form row indices and are only
#                 echoed back to the user)
#
# Returns a hash ref: reference number => content after randomisation.
# Prints progress/debug text to STDOUT as a side effect.
sub randomise{
    my $refhashref=shift;
    my $anchor_ref=shift;
    my %refhash=%$refhashref;
    my %anchors=%$anchor_ref;
    # Parenthesised so that ALL of these are lexical; "my %a, %b;" would only
    # declare the first and silently leave the rest as package globals.
    my (%randomisedrefhash, %Xrefhash);

    my @sortedrefs=sort {$a <=> $b} keys %refhash;

    # Position of every reference number in the sorted order. The final
    # pairing loop matches $sortedrefs[$i] with $refstack[$i], so this is the
    # slot an anchored reference has to occupy. (Using the form row index
    # instead goes wrong as soon as a row has been deleted.)
    my %slot;
    @slot{@sortedrefs}=(0..$#sortedrefs);

    # randomise the list
    my @refstack=shuffle @sortedrefs;

    ## transpose anchors back to their required location
    for(my $x=0;$x<@refstack;$x++){
        if(defined($anchors{$refstack[$x]})){
            my $anchor=\$refstack[$slot{$refstack[$x]}];
            my $temp=$refstack[$x];
            $refstack[$x]=$$anchor;

            print "---Swapping $temp with ".$$anchor;
            $$anchor=$temp;

            # The element swapped into position $x may itself be an anchor
            # that is not yet home; if so, process this position again.
            if(defined($anchors{$refstack[$x]})){
                if($refstack[$slot{$refstack[$x]}] ne $$anchor){
                    $x--;
                }
            }
        }
    }

    ## randomise the refs and the content associations, and create the cross-ref hash
    foreach my $ref (@sortedrefs){
        my $key=shift @refstack;
        $randomisedrefhash{$ref}=$refhash{$key};
        $Xrefhash{$key}=$ref;   # old number => new number
    }

    ## now do the content link substitutions
    foreach my $ref (keys %randomisedrefhash){
        $randomisedrefhash{$ref}=~s/(return\sto|go\sto|turn\sto)(\s+)(page|paragraph|reference|section)*(\s)*(\d+)/substitute_xref($1,$2,$3,$4,$5,\%Xrefhash)/egi;
    }

    print "You asked for the following anchors:";
    foreach my $key (keys %anchors){
        print $anchors{$key};
    }

    return \%randomisedrefhash;
}

# Build the replacement text for one matched in-text link.
#
# Arguments (in order):
#   $pretext1 .. $pretext4 - the captured pieces preceding the number
#                            (verb, whitespace, optional noun, optional
#                            whitespace); any of them may be undef
#   $link                  - the reference number found in the text
#   $Xrefhashref           - hash ref: old reference number => new number
#
# Returns the four leading pieces followed by the new reference number.
# If $link has no entry in the cross-reference hash (a dangling link) the
# original number is kept instead of being replaced by an empty string.
sub substitute_xref{ ## not sure that this is necessary but the verboseness was easier to work out
    my ($pretext1,$pretext2,$pretext3,$pretext4,$link,$Xrefhashref)=@_;

    # Look the number up through the reference; copying the whole hash on
    # every substitution is unnecessary work.
    my $newlink=$Xrefhashref->{$link};
    $newlink=$link unless defined($newlink);

    # Unmatched optional captures arrive as undef; treat them as empty.
    return join('', map { defined($_) ? $_ : '' }
        $pretext1, $pretext2, $pretext3, $pretext4, $newlink);

}

sub save{
    print "Will Save soon";
}

# Print the editing form for a set of references, followed by a read-only
# rendering of the whole text.
#
# Argument:
#   $refhashref - hash ref: reference number => reference content
#
# For each reference (in numeric order) the form gets a number field, a
# delete checkbox, an anchor checkbox and a content textarea, all suffixed
# with the row index. When the 'Auto-HTML Tag' or 'PDF' parameter is set the
# rendered text gets <a id>/<a href> tags; when 'Auto-ABML Tag' is set it
# gets escaped <tt ref> tags. Reads the file-level $query object and prints
# to STDOUT; returns nothing meaningful.
sub display_refhash{
    my $refhashref=shift;
    my %refhash=%$refhashref;

    # The tagging mode does not change between rows, so look it up once.
    my $html_tags=(defined($query->param('Auto-HTML Tag')) || defined($query->param('PDF')));
    my $abml_tags=defined($query->param('Auto-ABML Tag'));

    print '<form method="post" action="doc_to_refs.cgi" enctype="multipart/form-data" name="doc_to_refs_form">';
    my $x=0;
    # Declared on its own line: "my $ref,$textblob;" only makes $ref lexical
    # and leaves $textblob as a package global.
    my $textblob='';
    foreach my $ref (sort {$a <=> $b} keys %refhash){
        my $reference="reference"."$x";
        my $reftext="reftext"."$x";
        my $anchor="anchor"."$x";
        my $delete="delete"."$x";
        my $default=$refhash{$ref};
        # Work on a copy so the form fields always see the plain number.
        my $label=$ref;

        print "Reference is: ".$query->textfield(-name=>$reference,-value=>$ref, -override=>1)."<br />";
        print $query->checkbox_group(-name=>$delete,-values=>$ref,-labels=>{$ref=>'Delete Me'})."<br />";
        print $query->checkbox_group(-name=>$anchor,-values=>$ref, -labels=>{$ref=>'Anchor Me (Will NOT get Randomised)'})."<br />";
        print "Content is: ".$query->textarea(-name=>$reftext, -default=>$default, -rows=>5, -override=>1, -cols=>100, -style=>"font-family:arial;width:98%")."<br />";
        print "<br /><br />";

        if($html_tags){
            $label=~s/(\d+)/\<a id\=\"$1\"\>$1\<\/a\>/gi;
            $default=~s/(return\sto|go\sto|turn\sto)(\s+)(page|paragraph|reference|section)*(\s*)(\d+)/\<a href\=\"\#$5\"\>$1 $2 $3 $4 $5\<\/a\>/gi;
        }

        if($abml_tags){
            # $label=~s/(\d+)/\<a id\=\"$1\"\>$1\<\/a\>/gi;
            $default=~s/(return\sto|go\sto|turn\sto)(\s+)(page|paragraph|reference|section)*(\s*)(\d+)/\&lt\;tt ref\=\"$5\"\&gt\;$1 $2 $3 $4 $5\&lt\;\/tt\&gt\;/gi;
        }

        $textblob.=$label." ".$default."<br /><br />";

        $x++;
    }

    # Row count, so the next request knows how many numbered fields to read.
    print $query->hidden(-name=>'references',-value=>$x,-override=>1);
#   print $query->submit(-name=>'Save');
    print $query->submit('Randomise');
    print $query->submit('Auto-ABML Tag');
    print $query->submit('Auto-HTML Tag');
    print $query->submit('PDF');
    print "</form><br /><br /><br />";
    $textblob=~s/\n/\<br \/\>/gi;

    print "<p>".$textblob."</p>";

}

# Render the references as one HTML document and send it to the client as
# a PDF produced by PDF::FromHTML.
#
# Argument:
#   $refhashref - hash ref: reference number => reference content
#
# References are emitted in numeric order. When the 'Auto-HTML Tag' or 'PDF'
# parameter is set, numbers get <a id> tags and in-text links get <a href>
# tags; when 'Auto-ABML Tag' is set, links get escaped <tt ref> tags.
# Reads the file-level $query object; prints the PDF bytes to STDOUT.
#
# NOTE(review): the answer quoted below this script reports that
# PDF::FromHTML::Template::Container::PageDef contains
# "last if $::x++ > 10;", which would explain output stopping at page 11.
# That is a limitation of the module, not of this sub.
sub output_pdf{
    my $refhashref=shift;
    my %refhash=%$refhashref;

    # The tagging mode does not change between references; look it up once.
    my $html_tags=(defined($query->param('Auto-HTML Tag')) || defined($query->param('PDF')));
    my $abml_tags=defined($query->param('Auto-ABML Tag'));

    # Declared on its own line: "my $ref,$textblob;" only makes $ref lexical
    # and leaves $textblob as a package global.
    my $textblob='';
    foreach my $ref (sort {$a <=> $b} keys %refhash){
        my $label=$ref;
        my $default=$refhash{$ref};

        if($html_tags){
            $label=~s/(\d+)/\<a id\=\"$1\"\>$1\<\/a\>/gi;
            $default=~s/(return\sto|go\sto|turn\sto)(\s+)(page|paragraph|reference|section)*(\s*)(\d+)/\<a href\=\"\#$5\"\>$1 $2 $3 $4 $5\<\/a\>/gi;
        }

        if($abml_tags){
            # $label=~s/(\d+)/\<a id\=\"$1\"\>$1\<\/a\>/gi;
            $default=~s/(return\sto|go\sto|turn\sto)(\s+)(page|paragraph|reference|section)*(\s*)(\d+)/\&lt\;tt ref\=\"$5\"\&gt\;$1 $2 $3 $4 $5\&lt\;\/tt\&gt\;/gi;
        }

        $textblob.=$label." ".$default."<br /><br />";
    }

    $textblob=~s/\n/\<br \/\>/gi;

    # print "<p>".$textblob."</p>";

    $textblob='<html><head></head><body>'.$textblob.'</body></html>';

    my $output;
    if(defined($query->param('PDF'))){
        my $pdf = PDF::FromHTML->new( encoding => 'utf-8' );
        # Scalar references are used so that both the HTML input and the
        # PDF output stay in memory.
        $pdf->load_file(\$textblob);
        $pdf->convert(
            # With PDF::API2, font names such as 'traditional' also works
             Font        => 'Arial',
             LineHeight  => 10,
             Landscape   => 0,
        );
        $pdf->write_file(\$output);

        # A PDF is binary data: make sure no newline translation or
        # encoding layer on STDOUT alters the bytes.
        binmode STDOUT;
        print $output if defined($output);
    }
}

# Placeholder for the page header: the body is empty, so it emits nothing.
# Called by the main script right after the HTTP header for non-PDF responses.
sub html_header{

}

# Placeholder for the page footer: the body is empty, so it emits nothing.
# Called once at the end of the main script.
sub html_footer{

}

If you want sample data, let me know and I'll upload it somewhere.

10
  • Please provide a minimal reproducible example Commented Sep 22, 2019 at 12:33
  • Uh - I genuinely don't really know how I can do that without reproducing the entire script and my sample data, beyond the snippet that was in the linked question? Sorry :-/
    – Beeblbrox
    Commented Sep 23, 2019 at 14:58
  • Added an edit for what it's worth, but it's really no different from the linked question
    – Beeblbrox
    Commented Sep 23, 2019 at 15:04
  • "..displays the full 400 reference adventure with links in html just fine." What is a 400 reference adventure? Commented Sep 23, 2019 at 16:09
  • @HåkonHægland - uh sorry that's a bit confusing - adventure gamebooks like Choose Your Own Adventure - not really relevant to the problem - in short it's a 150 page document that translates to HTML fine in the script but when I pipe the doc through the converter it cuts short at 11 pages and doesn't translate the links
    – Beeblbrox
    Commented Sep 23, 2019 at 17:05

1 Answer 1

1

"... and on larger documents the processing ceases at Page 11 of the PDF..."

This seems to be due to a bug in PDF::FromHTML::Template::Container::PageDef. Notice the line:

last if $::x++ > 10;

It means it will never create more than 11 pages. I have filed a bug report

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Not the answer you're looking for? Browse other questions tagged or ask your own question.