PySpark startup time: 1.684901475906372
Pyspark job time: 706.9621119499207 s
== Parsed Logical Plan ==
Join Inner, (ip#2 = BadIPs#62)
:- Project [ip#2, client_id#3, user_id#4, date#5, method#6, regexp_replace(endpoint#7, ^/~[^/]+, randomstr#40) AS endpoint#51, protocol#8, response_code#9, content_size#29, randomstr#40]
:  +- Project [ip#2, client_id#3, user_id#4, date#5, method#6, endpoint#7, protocol#8, response_code#9, content_size#29, CreateRandomID() AS randomstr#40]
:     +- Filter NOT (endpoint#7 = )
:        +- Project [ip#2, client_id#3, user_id#4, date#5, method#6, endpoint#7, protocol#8, response_code#9, coalesce(content_size#10, cast(0 as int)) AS content_size#29]
:           +- Project [regexp_extract(value#0, (^\S+) , 1) AS ip#2, regexp_extract(value#0, ^\S+ (\S+) , 1) AS client_id#3, regexp_extract(value#0, ^\S+ \S+ (\S+) , 1) AS user_id#4, regexp_extract(value#0, ^.*\[([\w:/]+\s[+\-]\d{4})\], 1) AS date#5, regexp_extract(value#0, ^.*"(\S+) \S+\s*\S*\s*", 1) AS method#6, regexp_extract(value#0, ^.*"\S+ (\S+)\s*\S*\s*", 1) AS endpoint#7, regexp_extract(value#0, ^.*"\S+ \S+\s*(\S*)\s*", 1) AS protocol#8, cast(regexp_extract(value#0, ^.*" (\d{3}) , 1) as int) AS response_code#9, cast(regexp_extract(value#0, ^.*" \d{3} (\S+), 1) as int) AS content_size#10]
:              +- Relation[value#0] text
+- Relation[BadIPs#62] csv

== Analyzed Logical Plan ==
ip: string, client_id: string, user_id: string, date: string, method: string, endpoint: string, protocol: string, response_code: int, content_size: int, randomstr: string, BadIPs: string
Join Inner, (ip#2 = BadIPs#62)
:- Project [ip#2, client_id#3, user_id#4, date#5, method#6, regexp_replace(endpoint#7, ^/~[^/]+, randomstr#40) AS endpoint#51, protocol#8, response_code#9, content_size#29, randomstr#40]
:  +- Project [ip#2, client_id#3, user_id#4, date#5, method#6, endpoint#7, protocol#8, response_code#9, content_size#29, CreateRandomID() AS randomstr#40]
:     +- Filter NOT (endpoint#7 = )
:        +- Project [ip#2, client_id#3, user_id#4, date#5, method#6, endpoint#7, protocol#8, response_code#9, coalesce(content_size#10, cast(0 as int)) AS content_size#29]
:           +- Project [regexp_extract(value#0, (^\S+) , 1) AS ip#2, regexp_extract(value#0, ^\S+ (\S+) , 1) AS client_id#3, regexp_extract(value#0, ^\S+ \S+ (\S+) , 1) AS user_id#4, regexp_extract(value#0, ^.*\[([\w:/]+\s[+\-]\d{4})\], 1) AS date#5, regexp_extract(value#0, ^.*"(\S+) \S+\s*\S*\s*", 1) AS method#6, regexp_extract(value#0, ^.*"\S+ (\S+)\s*\S*\s*", 1) AS endpoint#7, regexp_extract(value#0, ^.*"\S+ \S+\s*(\S*)\s*", 1) AS protocol#8, cast(regexp_extract(value#0, ^.*" (\d{3}) , 1) as int) AS response_code#9, cast(regexp_extract(value#0, ^.*" \d{3} (\S+), 1) as int) AS content_size#10]
:              +- Relation[value#0] text
+- Relation[BadIPs#62] csv

== Optimized Logical Plan ==
Join Inner, (ip#2 = BadIPs#62)
:- Project [regexp_extract(value#0, (^\S+) , 1) AS ip#2, regexp_extract(value#0, ^\S+ (\S+) , 1) AS client_id#3, regexp_extract(value#0, ^\S+ \S+ (\S+) , 1) AS user_id#4, regexp_extract(value#0, ^.*\[([\w:/]+\s[+\-]\d{4})\], 1) AS date#5, regexp_extract(value#0, ^.*"(\S+) \S+\s*\S*\s*", 1) AS method#6, regexp_replace(regexp_extract(value#0, ^.*"\S+ (\S+)\s*\S*\s*", 1), ^/~[^/]+, pythonUDF1#115) AS endpoint#51, regexp_extract(value#0, ^.*"\S+ \S+\s*(\S*)\s*", 1) AS protocol#8, cast(regexp_extract(value#0, ^.*" (\d{3}) , 1) as int) AS response_code#9, coalesce(cast(regexp_extract(value#0, ^.*" \d{3} (\S+), 1) as int), 0) AS content_size#29, pythonUDF1#115 AS randomstr#40]
:  +- BatchEvalPython [CreateRandomID(), CreateRandomID()], [value#0, pythonUDF0#114, pythonUDF1#115]
:     +- Filter (NOT (regexp_extract(value#0, ^.*"\S+ (\S+)\s*\S*\s*", 1) = ) && isnotnull(regexp_extract(value#0, (^\S+) , 1)))
:        +- Relation[value#0] text
+- Filter isnotnull(BadIPs#62)
   +- Relation[BadIPs#62] csv

== Physical Plan ==
*(3) BroadcastHashJoin [ip#2], [BadIPs#62], Inner, BuildRight
:- *(3) Project [regexp_extract(value#0, (^\S+) , 1) AS ip#2, regexp_extract(value#0, ^\S+ (\S+) , 1) AS client_id#3, regexp_extract(value#0, ^\S+ \S+ (\S+) , 1) AS user_id#4, regexp_extract(value#0, ^.*\[([\w:/]+\s[+\-]\d{4})\], 1) AS date#5, regexp_extract(value#0, ^.*"(\S+) \S+\s*\S*\s*", 1) AS method#6, regexp_replace(regexp_extract(value#0, ^.*"\S+ (\S+)\s*\S*\s*", 1), ^/~[^/]+, pythonUDF1#115) AS endpoint#51, regexp_extract(value#0, ^.*"\S+ \S+\s*(\S*)\s*", 1) AS protocol#8, cast(regexp_extract(value#0, ^.*" (\d{3}) , 1) as int) AS response_code#9, coalesce(cast(regexp_extract(value#0, ^.*" \d{3} (\S+), 1) as int), 0) AS content_size#29, pythonUDF1#115 AS randomstr#40]
:  +- BatchEvalPython [CreateRandomID(), CreateRandomID()], [value#0, pythonUDF0#114, pythonUDF1#115]
:     +- *(1) Filter (NOT (regexp_extract(value#0, ^.*"\S+ (\S+)\s*\S*\s*", 1) = ) && isnotnull(regexp_extract(value#0, (^\S+) , 1)))
:        +- *(1) FileScan text [value#0] Batched: false, Format: Text, Location: InMemoryFileIndex[file:/data/logs/2000.01.01.txt, file:/data/logs/2000.01.02.txt, file:/data/logs..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<value:string>
+- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, true]))
   +- *(2) Project [BadIPs#62]
      +- *(2) Filter isnotnull(BadIPs#62)
         +- *(2) FileScan csv [BadIPs#62] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/data/logs/ip_blacklist.csv], PartitionFilters: [], PushedFilters: [IsNotNull(BadIPs)], ReadSchema: struct<BadIPs:string>
{"startupTime": 1.684901475906372, "jobTime": 706.9621119499207}
