Beefy Boxes and Bandwidth Generously Provided by pair Networks
more useful options
 
PerlMonks  

Re^4: Perl 5 Optimizing Compiler, Part 5: A Vague Outline Emerges

by BrowserUk (Pope)
on Aug 31, 2012 at 16:27 UTC ( #991036=note: print w/ replies, xml ) Need Help??


in reply to Re^3: Perl 5 Optimizing Compiler, Part 5: A Vague Outline Emerges
in thread Perl 5 Optimizing Compiler, Part 5: A Vague Outline Emerges

Okay. I compiled run.c -> run.e using the cl /E. It produced 30,000 lines of post-precprocessed C. Most of which is unrelated to Perl having been pulled in from a crap load of OS header files. (I won't post it here, it's too big and entirely uninteresting anyway.)

I then threw that file at clang -- all 30,000 lines of it -- and asked it to convert it to LLVM assembler with no optimisation. It took most of the day cleaning up stuff that clang is really pedantic about -- it deosn't allow duplicate typedefs, even if they are identical except whitespace; it doesn't like MSC source code annotations or accept most of their pragmas; and it doesn't like prototypes without ; on the end ... and there were 1000s of them produced from the perl headers -- but 9 hours work and I got there.

It produced these 572 lines:

I then asked it to optimise it. This time it produced just these 363 lines:

Now looking at that, I can see that it has built data descriptions for a crapload of Windows internal data structures, so I've manually (and conservatively) removed anything that I don't think are used by Perl. (I could have done this at the /e stage, but it was *much* easier reading 700 lines that 30000 lines :)

What I've ended up with is these 112 lines of IR:

; ModuleID = '/tmp/webcompile/_9304_0.bc' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i6 +4:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f +80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" %struct._TP_CALLBACK_ENVIRON = type { i64, %struct._TP_POOL*, %struct. +_TP_CLEANUP_GROUP*, void (i8*, i8*)*, i8*, %struct._ACTIVATION_CONTEX +T*, void (%struct._TP_CALLBACK_INSTANCE*, i8*)*, %union.anon } %struct._TP_POOL = type opaque %struct._TP_CLEANUP_GROUP = type opaque %struct._ACTIVATION_CONTEXT = type opaque %struct._TP_CALLBACK_INSTANCE = type opaque %union.anon = type { i64 } %struct.interpreter = type { %struct.sv**, %struct.op*, %struct.sv**, +%struct.sv**, %struct.sv**, i64*, i8**, i64, i64, %union.any*, i64, i +64, %struct.sv**, i64, i64, i64, i64, i64*, i64*, i64*, %struct.sv*, +%struct.xpv*, i64, %struct._stat64, %struct._stat64, %struct.gv*, %st +ruct.sv*, %struct.tms, %struct.pmop*, %struct.sv*, %struct.gv*, %stru +ct.gv*, %struct.gv*, i8*, %struct.sv*, %struct.sv*, %struct.sv*, %str +uct.hv*, %struct.hv*, %struct.op*, %struct.jmpenv*, %struct.cop*, %st +ruct.av*, %struct.stackinfo*, %struct.av*, %struct.jmpenv*, %struct.j +mpenv, %struct.sv*, %struct.he*, %struct.op*, %struct.op*, %struct.hv +*, %struct.gv*, %struct.gv*, i8*, i64, i64*, i64*, %struct.sv*, %stru +ct.re_save_state, %struct.regnode, i16, i8, i8, [6 x i8*], void (%str +uct.interpreter*, %struct.op*)*, void (%struct.interpreter*, %struct. +op*)*, void (%struct.interpreter*, %struct.op*)*, i64, i64, i8**, i8* +, %struct.regmatch_slab*, %struct.regmatch_state*, i16, i8, i8, i8, i +8, i32, i8, i64, i32, i8**, %struct.gv*, %struct.gv*, %struct.gv*, i8 +*, %struct.sv*, %struct.sv*, %struct.sv*, %struct.sv*, i8**, i8*, i8, + i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8*, %struct +.sv*, i64, %struct.sv*, i64, i64, i64, i32, i32*, %struct.gv*, %struc +t.gv*, %struct.gv*, %struct.gv*, %struct.gv*, %struct.av*, %struct.gv +*, %struct.gv*, %struct.gv*, %struct.gv*, %struct.gv*, %struct.sv*, % +struct.sv*, %struct.sv*, %struct.av*, %struct.hv*, %struct.hv*, %stru +ct.sv*, %struct.av*, %struct.av*, %struct.av*, %struct.av*, %struct.a +v*, %struct.hv*, i64, i32, i64, i64, %struct.sv*, %struct.sv*, %struc +t.av*, i8*, %struct.cv*, %struct.op*, %struct.op*, %struct.op*, %stru +ct.op*, %struct.cop*, i32, i32, i8*, i8**, i8*, %struct.av*, %struct. +sv*, %struct.sv*, i64, i8, i8, i16, i32, i64, %struct.exitlistentry*, + %struct.hv*, i64*, %struct.cop, %struct.cv*, %struct.av*, %struct.av +*, i64, i64, %struct.interp_intern, %struct.cv*, i32, i8, i8, i8, i8, + i64, i64, i64, i64, i64, i64, i64, i64, i8**, i8*, void (i32)*, [16 +x i8*], i64, i32, {}*, %struct.sv, %struct.sv, %struct.sv, %struct.sv +*, i64, i64, i64, i64, i64, i64, i64, i64, i64, i8*, i64, i64, i64, i +8, i8, i8, i8, i8*, %struct.sv*, %struct.sv*, %struct.sv*, %struct.sv +*, %struct.sv*, %struct.sv*, %struct.sv*, %struct.sv*, %struct.sv*, % +struct.sv*, %struct.sv*, %struct.sv*, %struct.sv*, %struct.sv*, %stru +ct.sv*, %struct.sv*, %struct.sv*, %struct.sv*, %struct.sv*, %struct.s +v*, %struct.sv*, %struct.sv*, %struct.sv*, %struct.sv*, %struct.sv*, +%struct.sv*, %struct.sv*, %struct.sv*, %struct.sv*, %struct.sv*, %str +uct.hv*, i8*, i64, [10 x i8], i8, i8, i32, %struct.yy_parser*, %struc +t.sv**, %struct.sv**, %struct.ptr_tbl*, %struct.av*, i8*, %struct.sv* +, %struct.sv**, %struct.av*, %struct.REENTR*, %struct.hv*, %struct.hv +*, %struct._PerlIO*, %struct.PerlIO_list_s*, %struct.PerlIO_list_s*, +%struct.sv*, %struct.perl_debug_pad, %struct.sv*, %struct.sv*, %struc +t.sv*, %struct.sv*, i64 (%struct.interpreter*, %struct.sv*, %struct.s +v*)*, %struct.av*, %struct.av*, i64, i64, i32, %struct.hv*, void (%st +ruct.interpreter*, %struct.sv*)*, void (%struct.interpreter*, %struct +.sv*)*, void (%struct.interpreter*, %struct.sv*)*, {}*, void (%struct +.interpreter*)*, i64, i64, %struct.hv*, i32, i8**, i8 (%struct.interp +reter*, %struct.sv*)*, %struct.hv*, %struct.av*, %struct.hv*, %struct +.hv*, %struct.hv* } %struct.sv = type { i8*, i64, i64, %union.anon.0 } %union.anon.0 = type { i8* } %struct.op = type { %struct.op*, %struct.op*, %struct.op* (%struct.int +erpreter*)*, i64, [2 x i8], i8, i8 } %union.any = type { i8* } %struct.xpv = type { %struct.hv*, %union._xmgu, i64, i64 } %struct.hv = type { %struct.xpvhv*, i64, i64, %union.anon.3 } %struct.xpvhv = type { %struct.hv*, %union._xmgu, i64, i64 } %union._xmgu = type { %struct.magic* } %struct.magic = type { %struct.magic*, %struct.mgvtbl*, i16, i8, i8, i +64, %struct.sv*, i8* } %struct.mgvtbl = type { i32 (%struct.interpreter*, %struct.sv*, %struc +t.magic*)*, i32 (%struct.interpreter*, %struct.sv*, %struct.magic*)*, + i64 (%struct.interpreter*, %struct.sv*, %struct.magic*)*, i32 (%stru +ct.interpreter*, %struct.sv*, %struct.magic*)*, i32 (%struct.interpre +ter*, %struct.sv*, %struct.magic*)*, i32 (%struct.interpreter*, %stru +ct.sv*, %struct.magic*, %struct.sv*, i8*, i64)*, i32 (%struct.interpr +eter*, %struct.magic*, %struct.clone_params*)*, i32 (%struct.interpre +ter*, %struct.sv*, %struct.magic*)* } %struct.clone_params = type { %struct.av*, i64, %struct.interpreter*, +%struct.interpreter*, %struct.av* } %struct.av = type { %struct.xpvav*, i64, i64, %union.anon.2 } %struct.xpvav = type { %struct.hv*, %union._xmgu, i64, i64, %struct.sv +** } %union.anon.2 = type { i8* } %union.anon.3 = type { i8* } %struct._stat64 = type { i32, i16, i16, i16, i16, i16, i32, i64, i64, +i64, i64 } %struct.gv = type { %struct.xpvgv*, i64, i64, %union.anon.7 } %struct.xpvgv = type { %struct.hv*, %union._xmgu, i64, i64, %union._xi +vu, %union._xnvu } %union._xivu = type { i64 } %union._xnvu = type { %struct.anon.5 } %struct.anon.5 = type { i64, i64 } %union.anon.7 = type { i8* } %struct.tms = type { i64, i64, i64, i64 } %struct.pmop = type { %struct.op*, %struct.op*, %struct.op* (%struct.i +nterpreter*)*, i64, [2 x i8], i8, i8, %struct.op*, %struct.op*, i64, +i64, %union.anon.12, %union.anon.13 } %union.anon.12 = type { %struct.op* } %union.anon.13 = type { %struct.op* } %struct.jmpenv = type { %struct.jmpenv*, [16 x i32], i32, i8 } %struct.cop = type { %struct.op*, %struct.op*, %struct.op* (%struct.in +terpreter*)*, i64, [2 x i8], i8, i8, i64, i8*, i8*, i64, i64, i64*, % +struct.refcounted_he* } %struct.refcounted_he = type opaque %struct.stackinfo = type { %struct.av*, %struct.context*, %struct.stac +kinfo*, %struct.stackinfo*, i64, i64, i64, i64 } %struct.context = type { %union.anon.14 } %union.anon.14 = type { %struct.block } %struct.block = type { i8, i8, i16, i64, %struct.cop*, i64, i64, %stru +ct.pmop*, %union.anon.15 } %union.anon.15 = type { %struct.block_sub } %struct.block_sub = type { %struct.op*, %struct.cv*, %struct.av*, %str +uct.av*, i64, %struct.av* } %struct.cv = type { %struct.xpvcv*, i64, i64, %union.anon.11 } %struct.xpvcv = type { %struct.hv*, %union._xmgu, i64, i64, %struct.hv +*, %union.anon.9, %union.anon.10, %struct.gv*, i8*, %struct.av*, %str +uct.cv*, i64, i16, i64 } %union.anon.9 = type { %struct.op* } %union.anon.10 = type { %struct.op* } %union.anon.11 = type { i8* } %struct.he = type { %struct.he*, %struct.hek*, %union.anon.1 } %struct.hek = type { i64, i64, [1 x i8] } %union.anon.1 = type { %struct.sv* } %struct.re_save_state = type { i64, i64, i64, i8, i8*, i8*, i8*, %stru +ct.regexp_paren_pair*, i64*, i64*, i8**, %struct.magic*, %struct.pmop +*, %struct.pmop*, i8*, i64, i64, i64, i64, i64, i64, i8*, i8* } %struct.regexp_paren_pair = type { i64, i64 } %struct.regnode = type { i8, i8, i16 } %struct.regmatch_slab = type { [42 x %struct.regmatch_state], %struct. +regmatch_slab*, %struct.regmatch_slab* } %struct.regmatch_state = type { i32, i8*, %union.anon.22 } %union.anon.22 = type { %struct.anon.26 } %struct.anon.26 = type { %struct.regmatch_state*, i64, i64, i64, i16*, + %struct.regnode*, %struct.regnode*, i8*, i64, i16, i16, i8 } %struct.exitlistentry = type { void (%struct.interpreter*, i8*)*, i8* +} %struct.interp_intern = type { i8*, i8**, i64, %struct.av*, %struct.ch +ild_tab*, i64, %struct.pseudo_child_tab*, i8*, %struct.thread_intern, + %struct.HWND__*, i32, i32, [27 x void (i32)*] } %struct.child_tab = type { i64, [64 x i64], [64 x i8*] } %struct.pseudo_child_tab = type { i64, [64 x i64], [64 x i8*], [64 x % +struct.HWND__*], [64 x i8] } %struct.HWND__ = type { i32 } %struct.thread_intern = type { [512 x i8], %struct.servent, [128 x i8] +, i32, [30 x i8], i32, i16 } %struct.servent = type { i8*, i8**, i16, i8* } %struct.yy_parser = type { %struct.yy_parser*, %union.YYSTYPE, i32, i3 +2, i32, i32, %struct.yy_stack_frame*, %struct.yy_stack_frame*, i64, i +64, i8*, i8*, i8, i8, i8, i8, i64, %struct.op*, %struct.op*, %struct. +sv*, i16, i16, i64, %struct.sv*, i64, i64, i8, i8, i8, i8, i64, %stru +ct._sublex_info, %struct.sv*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i64, + i16, i8, i8, %struct.hv*, %struct._PerlIO**, %struct.av*, [5 x %unio +n.YYSTYPE], [5 x i64], i64, %struct.cop*, [256 x i8], i8, i8 } %union.YYSTYPE = type { i64 } %struct.yy_stack_frame = type { %union.YYSTYPE, i16, i64, %struct.cv* +} %struct._sublex_info = type { i8, i16, %struct.op*, i8*, i8* } %struct._PerlIO = type opaque %struct.ptr_tbl = type { %struct.ptr_tbl_ent**, i64, i64, %struct.ptr_ +tbl_arena*, %struct.ptr_tbl_ent*, %struct.ptr_tbl_ent* } %struct.ptr_tbl_ent = type { %struct.ptr_tbl_ent*, i8*, i8* } %struct.ptr_tbl_arena = type opaque %struct.REENTR = type { i32 } %struct.PerlIO_list_s = type opaque %struct.perl_debug_pad = type { [3 x %struct.sv] } define i64 @HEAP_MAKE_TAG_FLAGS(i64 %TagBase, i64 %Tag) nounwind uwtab +le readnone { %1 = shl i64 %Tag, 18 %2 = add i64 %1, %TagBase ret i64 %2 } define i32 @Perl_runops_standard(%struct.interpreter* %my_perl) nounwi +nd uwtable { %1 = getelementptr inbounds %struct.interpreter* %my_perl, i64 0, i3 +2 1 %2 = load %struct.op** %1, align 8, !tbaa !3 br label %3 ; <label>:3 ; preds = %3, %0 %op.0 = phi %struct.op* [ %2, %0 ], [ %6, %3 ] %4 = getelementptr inbounds %struct.op* %op.0, i64 0, i32 2 %5 = load %struct.op* (%struct.interpreter*)** %4, align 8, !tbaa !3 %6 = tail call %struct.op* %5(%struct.interpreter* %my_perl) nounwin +d store %struct.op* %6, %struct.op** %1, align 8, !tbaa !3 %7 = icmp eq %struct.op* %6, null br i1 %7, label %8, label %3 ; <label>:8 ; preds = %3 %9 = getelementptr inbounds %struct.interpreter* %my_perl, i64 0, i3 +2 78 store i8 0, i8* %9, align 1, !tbaa !0 ret i32 0 } !0 = metadata !{metadata !"omnipotent char", metadata !1} !1 = metadata !{metadata !"Simple C/C++ TBAA", null} !2 = metadata !{metadata !"long", metadata !0} !3 = metadata !{metadata !"any pointer", metadata !0} !4 = metadata !{metadata !"long long", metadata !0}

Take a close look at the data definitions for Interpreter, SV_any, HEK, COP etc. Isn't that the most concise (and thorough) description of the entire perl internals you've ever seen?

Doesn't it nake you wonder (just a little), what could it do with all that information?


With the rise and rise of 'Social' network sites: 'Computers are making people easier to use everyday'
Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.
"Science is about questioning the status quo. Questioning authority".
In the absence of evidence, opinion is indistinguishable from prejudice.

[http://thebottomline.cpaaustra


Comment on Re^4: Perl 5 Optimizing Compiler, Part 5: A Vague Outline Emerges
Select or Download Code
Re^5: Perl 5 Optimizing Compiler, Part 5: A Vague Outline Emerges
by dave_the_m (Parson) on Aug 31, 2012 at 17:05 UTC
    ake a close look at the data definitions for Interpreter, SV_any, HEK, COP etc. Isn't that the most concise (and thorough) description of the entire perl internals you've ever seen? Doesn't it nake you wonder (just a little), what could it do with all that information?
    No, not in the slightest. I think this is the fundamental impedence mismatch between you and me on this subject.

    You keep asserting that if LLVM can only be given a full picture of the program, it will be able to do (unspecified) wonderful things with it. I come from the viewpoint that for perl to perform a particular action, e.g. my $x = $h{$key}, there are a certain basic number of things the underlying hardware is going to have to at some point, such as calculate a hash index, index into the hash bucket, scan the hash bucket for a matching string, retrieve the associated SV, then copy the relevant part of the SV (e.g. its integer value, or its string buffer) into the SV stored in a pad somewhere that's associated with the name $x.

    Now at the moment, there's a certain amount of overhead associated with doing that via the perl stack and the perl runops loop, but there's still a basic mimimum that needs doing. I earlier showed that the overhead is probably less than 20%. To get better than this, LLVM has got to, in some magical way, cut into basic underlying operations like hash lookup. And I really don't think its going to that.

    Until someone provides me with a single actual concrete example of how LLVM might do anything better than just cut out a bit of that overhead, I'm not going to believe it.

    PS I completely fail to understand the the point of your showing how run.c gets converted to IR. run.c just contains a single trivial C function, operating on a few args and variables of particular types, and the IR knows the types of those args. So what?

    Dave

      No, not in the slightest. I think this is the fundamental impedence mismatch between you and me on this subject.

      Having spent the last 9 hours to produce what I posted, having you dismiss it in under 10 minutes -- barely enough time to read the post, never mind look at the code with any attention to detail -- is ... well ... let's just say, disappointing and leave it at that.

      You keep asserting that if LLVM can only be given a full picture of the program, it will be able to do (unspecified) wonderful things with it.

      Firstly, I said may, not "will". It is "(unspecified)" because -- as I've been at pains to state often & clearly -- noone yet knows. I've provided a long list of possibilities, cited documents with examples of them. But I am not a computer and cannot possibly be expected to mentally run hundreds of thousands of lines of Perl sources through a dozen or more optimisation techniques and pick out salient examples to satisfy your demands for the instant gratification of "a concrete example".

      All I've sought from you, is your knowledge and expertise of and with the existing codebase, to enable the investigations to get a quick, clean start.

      PS I completely fail to understand the the point of your showing how run.c gets converted to IR. run.c just contains a single trivial C function, operating on a few args and variables of particular types, and the IR knows the types of those args. So what?

      The "what" is, that if it can encapsulate and annotate the entire internal (data) structure of Perl, in 100 lines of language independent, platform independent, eminently human readable, metadata, then it can also rewrite those structures and the function trees that use them in ways that neither a C compiler, nor a C programmer -- no matter how experienced -- could ever imagine doing.


      With the rise and rise of 'Social' network sites: 'Computers are making people easier to use everyday'
      Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.
      "Science is about questioning the status quo. Questioning authority".
      In the absence of evidence, opinion is indistinguishable from prejudice.

      RIP Neil Armstrong

        Yes, but Dave's point is that that's not the slow part of Perl 5. You either generate LLVM code specific to the semantic knowledge of Perl 5 programs or you precompile any Perl 5 program you want to optimize to bitcode and let LLVM do whole program analysis. In the latter case, you may be able to get more than 20% improvements if enough of your code stays in analyzable chunks and you can perform optimizations like dead code elimination and hoistings. In the former case, you have to tell LLVM about the semantics of the language because it can't infer them from a big wad of C code.

        LLVM may be able to optimize the source code of the Perl 5 VM and that may be worthwhile, but that doesn't necessarily provide a measurable improvement to programs written in Perl 5.

        (Let me belabor this point with what should be a really obvious example.)

        Suppose I wrote a program that reads a multi-gigabyte file line by line to pick out a handful of records. No matter how much I microoptimize my code to get rid of dispatch overhead, I'm only going to get real speed benefits one of two ways: if I fix that algorithmic problem by hand, or if some very clever analysis of the code by something which understands the semantics of the code can rewrite it for me.

        Unfortunately, LLVM doesn't understand the semantics of the Perl 5 language. That's why you either have to get the entire program into bitcode (so that it can infer what's going on at a level it does understand) or build up some sort of LLVM-aware VM that can map the semantics of the language to LLVM's intermediate form such that intelligent JITting is possible.

        Unless you do one of those two things, your theoretical maximum improvements are in the "Hey, that's a nice improvement, but it doesn't transform things" arena. Maybe in certain cases people are willing to spend a couple of hours on a powerful machine to compile an important program to bitcode and then to native code, but I'm not sure that's the end goal here.

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://991036]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others having an uproarious good time at the Monastery: (4)
As of 2014-12-26 23:40 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    Is guessing a good strategy for surviving in the IT business?





    Results (176 votes), past polls