PySpark startup time: 1.654123306274414
Pyspark job time: 226.94821953773499 s
== Parsed Logical Plan ==
Join Inner, (ip#2 = BadIPs#39)
:- Filter NOT (endpoint#7 = )
:  +- Project [ip#2, client_id#3, user_id#4, date#5, method#6, endpoint#7, protocol#8, response_code#9, coalesce(content_size#10, cast(0 as int)) AS content_size#29]
:     +- Project [regexp_extract(value#0, (^\S+) , 1) AS ip#2, regexp_extract(value#0, ^\S+ (\S+) , 1) AS client_id#3, regexp_extract(value#0, ^\S+ \S+ (\S+) , 1) AS user_id#4, regexp_extract(value#0, ^.*\[([\w:/]+\s[+\-]\d{4})\], 1) AS date#5, regexp_extract(value#0, ^.*"(\S+) \S+\s*\S*\s*", 1) AS method#6, regexp_extract(value#0, ^.*"\S+ (\S+)\s*\S*\s*", 1) AS endpoint#7, regexp_extract(value#0, ^.*"\S+ \S+\s*(\S*)\s*", 1) AS protocol#8, cast(regexp_extract(value#0, ^.*" (\d{3}) , 1) as int) AS response_code#9, cast(regexp_extract(value#0, ^.*" \d{3} (\S+), 1) as int) AS content_size#10]
:        +- Relation[value#0] text
+- Relation[BadIPs#39] csv

== Analyzed Logical Plan ==
ip: string, client_id: string, user_id: string, date: string, method: string, endpoint: string, protocol: string, response_code: int, content_size: int, BadIPs: string
Join Inner, (ip#2 = BadIPs#39)
:- Filter NOT (endpoint#7 = )
:  +- Project [ip#2, client_id#3, user_id#4, date#5, method#6, endpoint#7, protocol#8, response_code#9, coalesce(content_size#10, cast(0 as int)) AS content_size#29]
:     +- Project [regexp_extract(value#0, (^\S+) , 1) AS ip#2, regexp_extract(value#0, ^\S+ (\S+) , 1) AS client_id#3, regexp_extract(value#0, ^\S+ \S+ (\S+) , 1) AS user_id#4, regexp_extract(value#0, ^.*\[([\w:/]+\s[+\-]\d{4})\], 1) AS date#5, regexp_extract(value#0, ^.*"(\S+) \S+\s*\S*\s*", 1) AS method#6, regexp_extract(value#0, ^.*"\S+ (\S+)\s*\S*\s*", 1) AS endpoint#7, regexp_extract(value#0, ^.*"\S+ \S+\s*(\S*)\s*", 1) AS protocol#8, cast(regexp_extract(value#0, ^.*" (\d{3}) , 1) as int) AS response_code#9, cast(regexp_extract(value#0, ^.*" \d{3} (\S+), 1) as int) AS content_size#10]
:        +- Relation[value#0] text
+- Relation[BadIPs#39] csv

== Optimized Logical Plan ==
Join Inner, (ip#2 = BadIPs#39)
:- Project [regexp_extract(value#0, (^\S+) , 1) AS ip#2, regexp_extract(value#0, ^\S+ (\S+) , 1) AS client_id#3, regexp_extract(value#0, ^\S+ \S+ (\S+) , 1) AS user_id#4, regexp_extract(value#0, ^.*\[([\w:/]+\s[+\-]\d{4})\], 1) AS date#5, regexp_extract(value#0, ^.*"(\S+) \S+\s*\S*\s*", 1) AS method#6, regexp_extract(value#0, ^.*"\S+ (\S+)\s*\S*\s*", 1) AS endpoint#7, regexp_extract(value#0, ^.*"\S+ \S+\s*(\S*)\s*", 1) AS protocol#8, cast(regexp_extract(value#0, ^.*" (\d{3}) , 1) as int) AS response_code#9, coalesce(cast(regexp_extract(value#0, ^.*" \d{3} (\S+), 1) as int), 0) AS content_size#29]
:  +- Filter (NOT (regexp_extract(value#0, ^.*"\S+ (\S+)\s*\S*\s*", 1) = ) && isnotnull(regexp_extract(value#0, (^\S+) , 1)))
:     +- Relation[value#0] text
+- Filter isnotnull(BadIPs#39)
   +- Relation[BadIPs#39] csv

== Physical Plan ==
*(2) BroadcastHashJoin [ip#2], [BadIPs#39], Inner, BuildRight
:- *(2) Project [regexp_extract(value#0, (^\S+) , 1) AS ip#2, regexp_extract(value#0, ^\S+ (\S+) , 1) AS client_id#3, regexp_extract(value#0, ^\S+ \S+ (\S+) , 1) AS user_id#4, regexp_extract(value#0, ^.*\[([\w:/]+\s[+\-]\d{4})\], 1) AS date#5, regexp_extract(value#0, ^.*"(\S+) \S+\s*\S*\s*", 1) AS method#6, regexp_extract(value#0, ^.*"\S+ (\S+)\s*\S*\s*", 1) AS endpoint#7, regexp_extract(value#0, ^.*"\S+ \S+\s*(\S*)\s*", 1) AS protocol#8, cast(regexp_extract(value#0, ^.*" (\d{3}) , 1) as int) AS response_code#9, coalesce(cast(regexp_extract(value#0, ^.*" \d{3} (\S+), 1) as int), 0) AS content_size#29]
:  +- *(2) Filter (NOT (regexp_extract(value#0, ^.*"\S+ (\S+)\s*\S*\s*", 1) = ) && isnotnull(regexp_extract(value#0, (^\S+) , 1)))
:     +- *(2) FileScan text [value#0] Batched: false, Format: Text, Location: InMemoryFileIndex[file:/data/logs/2000.01.01.txt, file:/data/logs/2000.01.02.txt, file:/data/logs..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<value:string>
+- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, true]))
   +- *(1) Project [BadIPs#39]
      +- *(1) Filter isnotnull(BadIPs#39)
         +- *(1) FileScan csv [BadIPs#39] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/data/logs/ip_blacklist.csv], PartitionFilters: [], PushedFilters: [IsNotNull(BadIPs)], ReadSchema: struct<BadIPs:string>
{"startupTime": 1.654123306274414, "jobTime": 226.94821953773499}
