PySpark startup time: 1.6745190620422363
Pyspark job time: 303.97684502601624 s
== Parsed Logical Plan ==
Project [value#0, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, method#77, endpoint#108, randomstr#97, protocol#131, response_code#143, content_size#183]
+- Filter isnotnull(response_code#143)
   +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, method#77, endpoint#108, randomstr#97, protocol#131, response_code#143, coalesce(content_size#156, cast(0 as int)) AS content_size#183]
      +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, method#77, endpoint#108, randomstr#97, protocol#131, response_code#143, coalesce(cast(trim(cols#2[9], None) as int), 0) AS content_size#156]
         +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, method#77, endpoint#108, randomstr#97, protocol#131, cast(trim(cols#2[8], None) as int) AS response_code#143]
            +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, method#77, endpoint#108, randomstr#97, substring(protocol#119, 0, (length(protocol#119) - 1)) AS protocol#131]
               +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, method#77, endpoint#108, randomstr#97, trim(cols#2[7], None) AS protocol#119]
                  +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, method#77, regexp_replace(endpoint#86, ^/~[^/]+, randomstr#97) AS endpoint#108, randomstr#97]
                     +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, method#77, endpoint#86, CreateRandomID() AS randomstr#97]
                        +- Filter (length(endpoint#86) > 0)
                           +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, method#77, trim(cols#2[6], None) AS endpoint#86]
                              +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, substring(method#68, 2, 1000) AS method#77]
                                 +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, trim(cols#2[5], None) AS method#68]
                                    +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, substring(date#52, 2, (length(date#52) - 1)) AS date#60]
                                       +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, substring(date#44, 0, (length(date#44) - 1)) AS date#52]
                                          +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, trim(date#36, None) AS date#44]
                                             +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, concat(cols#2[3],  , cols#2[4]) AS date#36]
                                                +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, trim(cols#2[2], None) AS user_id#29]
                                                   +- Project [value#0, cols#2, ip#5, BadIPs#9, trim(cols#2[1], None) AS client_id#23]
                                                      +- Join Inner, (ip#5 = BadIPs#9)
                                                         :- Project [value#0, cols#2, trim(cols#2[0], None) AS ip#5]
                                                         :  +- Project [value#0, split(value#0,  ) AS cols#2]
                                                         :     +- Relation[value#0] text
                                                         +- Relation[BadIPs#9] csv

== Analyzed Logical Plan ==
value: string, ip: string, BadIPs: string, client_id: string, user_id: string, date: string, method: string, endpoint: string, randomstr: string, protocol: string, response_code: int, content_size: int
Project [value#0, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, method#77, endpoint#108, randomstr#97, protocol#131, response_code#143, content_size#183]
+- Filter isnotnull(response_code#143)
   +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, method#77, endpoint#108, randomstr#97, protocol#131, response_code#143, coalesce(content_size#156, cast(0 as int)) AS content_size#183]
      +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, method#77, endpoint#108, randomstr#97, protocol#131, response_code#143, coalesce(cast(trim(cols#2[9], None) as int), 0) AS content_size#156]
         +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, method#77, endpoint#108, randomstr#97, protocol#131, cast(trim(cols#2[8], None) as int) AS response_code#143]
            +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, method#77, endpoint#108, randomstr#97, substring(protocol#119, 0, (length(protocol#119) - 1)) AS protocol#131]
               +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, method#77, endpoint#108, randomstr#97, trim(cols#2[7], None) AS protocol#119]
                  +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, method#77, regexp_replace(endpoint#86, ^/~[^/]+, randomstr#97) AS endpoint#108, randomstr#97]
                     +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, method#77, endpoint#86, CreateRandomID() AS randomstr#97]
                        +- Filter (length(endpoint#86) > 0)
                           +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, method#77, trim(cols#2[6], None) AS endpoint#86]
                              +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, substring(method#68, 2, 1000) AS method#77]
                                 +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, date#60, trim(cols#2[5], None) AS method#68]
                                    +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, substring(date#52, 2, (length(date#52) - 1)) AS date#60]
                                       +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, substring(date#44, 0, (length(date#44) - 1)) AS date#52]
                                          +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, trim(date#36, None) AS date#44]
                                             +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, user_id#29, concat(cols#2[3],  , cols#2[4]) AS date#36]
                                                +- Project [value#0, cols#2, ip#5, BadIPs#9, client_id#23, trim(cols#2[2], None) AS user_id#29]
                                                   +- Project [value#0, cols#2, ip#5, BadIPs#9, trim(cols#2[1], None) AS client_id#23]
                                                      +- Join Inner, (ip#5 = BadIPs#9)
                                                         :- Project [value#0, cols#2, trim(cols#2[0], None) AS ip#5]
                                                         :  +- Project [value#0, split(value#0,  ) AS cols#2]
                                                         :     +- Relation[value#0] text
                                                         +- Relation[BadIPs#9] csv

== Optimized Logical Plan ==
Project [value#0, ip#5, BadIPs#9, trim(cols#2[1], None) AS client_id#23, trim(cols#2[2], None) AS user_id#29, substring(substring(trim(concat(cols#2[3],  , cols#2[4]), None), 0, (length(trim(concat(cols#2[3],  , cols#2[4]), None)) - 1)), 2, (length(substring(trim(concat(cols#2[3],  , cols#2[4]), None), 0, (length(trim(concat(cols#2[3],  , cols#2[4]), None)) - 1))) - 1)) AS date#60, substring(trim(cols#2[5], None), 2, 1000) AS method#77, regexp_replace(trim(cols#2[6], None), ^/~[^/]+, pythonUDF1#240) AS endpoint#108, pythonUDF1#240 AS randomstr#97, substring(trim(cols#2[7], None), 0, (length(trim(cols#2[7], None)) - 1)) AS protocol#131, cast(trim(cols#2[8], None) as int) AS response_code#143, coalesce(coalesce(cast(trim(cols#2[9], None) as int), 0), 0) AS content_size#183]
+- BatchEvalPython [CreateRandomID(), CreateRandomID()], [value#0, cols#2, ip#5, BadIPs#9, pythonUDF0#239, pythonUDF1#240]
   +- Join Inner, (ip#5 = BadIPs#9)
      :- Project [value#0, split(value#0,  ) AS cols#2, trim(split(value#0,  )[0], None) AS ip#5]
      :  +- Filter (((length(trim(split(value#0,  )[6], None)) > 0) && isnotnull(cast(trim(split(value#0,  )[8], None) as int))) && isnotnull(trim(split(value#0,  )[0], None)))
      :     +- Relation[value#0] text
      +- Filter isnotnull(BadIPs#9)
         +- Relation[BadIPs#9] csv

== Physical Plan ==
*(3) Project [value#0, ip#5, BadIPs#9, trim(cols#2[1], None) AS client_id#23, trim(cols#2[2], None) AS user_id#29, substring(substring(trim(concat(cols#2[3],  , cols#2[4]), None), 0, (length(trim(concat(cols#2[3],  , cols#2[4]), None)) - 1)), 2, (length(substring(trim(concat(cols#2[3],  , cols#2[4]), None), 0, (length(trim(concat(cols#2[3],  , cols#2[4]), None)) - 1))) - 1)) AS date#60, substring(trim(cols#2[5], None), 2, 1000) AS method#77, regexp_replace(trim(cols#2[6], None), ^/~[^/]+, pythonUDF1#240) AS endpoint#108, pythonUDF1#240 AS randomstr#97, substring(trim(cols#2[7], None), 0, (length(trim(cols#2[7], None)) - 1)) AS protocol#131, cast(trim(cols#2[8], None) as int) AS response_code#143, coalesce(coalesce(cast(trim(cols#2[9], None) as int), 0), 0) AS content_size#183]
+- BatchEvalPython [CreateRandomID(), CreateRandomID()], [value#0, cols#2, ip#5, BadIPs#9, pythonUDF0#239, pythonUDF1#240]
   +- *(2) BroadcastHashJoin [ip#5], [BadIPs#9], Inner, BuildRight
      :- *(2) Project [value#0, split(value#0,  ) AS cols#2, trim(split(value#0,  )[0], None) AS ip#5]
      :  +- *(2) Filter (((length(trim(split(value#0,  )[6], None)) > 0) && isnotnull(cast(trim(split(value#0,  )[8], None) as int))) && isnotnull(trim(split(value#0,  )[0], None)))
      :     +- *(2) FileScan text [value#0] Batched: false, Format: Text, Location: InMemoryFileIndex[file:/data/logs/2000.01.01.txt, file:/data/logs/2000.01.02.txt, file:/data/logs..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<value:string>
      +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, true]))
         +- *(1) Project [BadIPs#9]
            +- *(1) Filter isnotnull(BadIPs#9)
               +- *(1) FileScan csv [BadIPs#9] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/data/logs/ip_blacklist.csv], PartitionFilters: [], PushedFilters: [IsNotNull(BadIPs)], ReadSchema: struct<BadIPs:string>
{"startupTime": 1.6745190620422363, "jobTime": 303.97684502601624}
