Skip to content
This repository was archived by the owner on Jan 7, 2025. It is now read-only.

Commit 271d288

Browse files
authored
Subquery Unnesting: Exists + In Support (#259)
- Support uncorrelated and correlated IN (ANY) and EXISTS clauses. (This should effectively make subquery unnesting feature-complete!)
- TPC-H Q4, Q16, Q20, and Q22 are now working.
- Most of the remaining queries seem to produce plans that are too poor to run. (Q18 has a separate issue with a [fix in the pipeline](https://github.com/cmu-db/optd/pull/261), but its plan still seems to be too slow.)
1 parent cca20d4 commit 271d288

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+1306
-183
lines changed

optd-datafusion-bridge/src/from_optd.rs

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,18 @@ impl OptdPlanContext<'_> {
195195
Some(else_expr),
196196
)?)
197197
}
198+
FuncType::Not => {
199+
let expr = args[0].clone();
200+
Ok(physical_expr::expressions::not(expr)?)
201+
}
202+
FuncType::IsNull => {
203+
let expr = args[0].clone();
204+
Ok(physical_expr::expressions::is_null(expr)?)
205+
}
206+
FuncType::IsNotNull => {
207+
let expr = args[0].clone();
208+
Ok(physical_expr::expressions::is_not_null(expr)?)
209+
}
198210
_ => unreachable!(),
199211
}
200212
}
@@ -464,14 +476,21 @@ impl OptdPlanContext<'_> {
464476
let physical_expr =
465477
self.conv_from_optd_expr(node.cond(), &Arc::new(filter_schema.clone()))?;
466478

467-
if node.join_type() == JoinType::Cross {
479+
if *node.join_type() == JoinType::Cross {
468480
return Ok(Arc::new(CrossJoinExec::new(left_exec, right_exec))
469481
as Arc<dyn ExecutionPlan + 'static>);
470482
}
471483

472484
let join_type = match node.join_type() {
473-
JoinType::Inner => datafusion::logical_expr::JoinType::Inner,
474-
JoinType::LeftOuter => datafusion::logical_expr::JoinType::Left,
485+
JoinType::Inner => datafusion_expr::JoinType::Inner,
486+
JoinType::FullOuter => datafusion_expr::JoinType::Full,
487+
JoinType::LeftOuter => datafusion_expr::JoinType::Left,
488+
JoinType::RightOuter => datafusion_expr::JoinType::Right,
489+
JoinType::LeftSemi => datafusion_expr::JoinType::LeftSemi,
490+
JoinType::RightSemi => datafusion_expr::JoinType::RightSemi,
491+
JoinType::LeftAnti => datafusion_expr::JoinType::LeftAnti,
492+
JoinType::RightAnti => datafusion_expr::JoinType::RightAnti,
493+
JoinType::LeftMark => datafusion_expr::JoinType::LeftMark,
475494
_ => unimplemented!(),
476495
};
477496

optd-datafusion-bridge/src/into_optd.rs

Lines changed: 75 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
// Use of this source code is governed by an MIT-style license that can be found in the LICENSE file or at
44
// https://opensource.org/licenses/MIT.
55

6+
use std::sync::Arc;
7+
68
use anyhow::{bail, Result};
79
use datafusion::common::DFSchema;
810
use datafusion::logical_expr::{self, logical_plan, LogicalPlan, Operator};
@@ -15,7 +17,7 @@ use optd_datafusion_repr::plan_nodes::{
1517
ConstantPred, DfReprPlanNode, DfReprPredNode, ExternColumnRefPred, FuncPred, FuncType,
1618
InListPred, JoinType, LikePred, ListPred, LogOpPred, LogOpType, LogicalAgg,
1719
LogicalEmptyRelation, LogicalFilter, LogicalJoin, LogicalLimit, LogicalProjection, LogicalScan,
18-
LogicalSort, RawDependentJoin, SortOrderPred, SortOrderType,
20+
LogicalSort, RawDependentJoin, SortOrderPred, SortOrderType, SubqueryType,
1921
};
2022
use optd_datafusion_repr::properties::schema::Schema as OptdSchema;
2123

@@ -24,15 +26,18 @@ use crate::OptdPlanContext;
2426
impl OptdPlanContext<'_> {
2527
fn subqueries_to_dependent_joins(
2628
&mut self,
27-
subqueries: &[&Subquery],
29+
subqueries: Vec<(&Subquery, SubqueryType)>,
2830
input: ArcDfPlanNode,
2931
input_schema: &DFSchema,
3032
) -> Result<ArcDfPlanNode> {
3133
let mut node = input;
32-
for Subquery {
33-
subquery,
34-
outer_ref_columns,
35-
} in subqueries.iter()
34+
for (
35+
Subquery {
36+
subquery,
37+
outer_ref_columns,
38+
},
39+
sq_typ,
40+
) in subqueries.into_iter()
3641
{
3742
let subquery_root = self.conv_into_optd_plan_node(subquery, Some(input_schema))?;
3843
let dep_join = RawDependentJoin::new(
@@ -56,7 +61,7 @@ impl OptdPlanContext<'_> {
5661
})
5762
.collect(),
5863
),
59-
JoinType::Cross,
64+
sq_typ,
6065
);
6166
node = dep_join.into_plan_node();
6267
}
@@ -92,7 +97,7 @@ impl OptdPlanContext<'_> {
9297
expr: &'a logical_expr::Expr,
9398
context: &DFSchema,
9499
dep_ctx: Option<&DFSchema>,
95-
subqueries: &mut Vec<&'a Subquery>,
100+
subqueries: &mut Vec<(&'a Subquery, SubqueryType)>,
96101
) -> Result<ArcDfPredNode> {
97102
use logical_expr::Expr;
98103
match expr {
@@ -257,6 +262,18 @@ impl OptdPlanContext<'_> {
257262
)
258263
.into_pred_node())
259264
}
265+
Expr::Not(x) => {
266+
let expr = self.conv_into_optd_expr(x.as_ref(), context, dep_ctx, subqueries)?;
267+
Ok(FuncPred::new(FuncType::Not, ListPred::new(vec![expr])).into_pred_node())
268+
}
269+
Expr::IsNull(x) => {
270+
let expr = self.conv_into_optd_expr(x.as_ref(), context, dep_ctx, subqueries)?;
271+
Ok(FuncPred::new(FuncType::IsNull, ListPred::new(vec![expr])).into_pred_node())
272+
}
273+
Expr::IsNotNull(x) => {
274+
let expr = self.conv_into_optd_expr(x.as_ref(), context, dep_ctx, subqueries)?;
275+
Ok(FuncPred::new(FuncType::IsNotNull, ListPred::new(vec![expr])).into_pred_node())
276+
}
260277
Expr::Between(x) => {
261278
let expr =
262279
self.conv_into_optd_expr(x.expr.as_ref(), context, dep_ctx, subqueries)?;
@@ -288,9 +305,53 @@ impl OptdPlanContext<'_> {
288305
// This relies on a left-deep tree of dependent joins being
289306
// generated below this node, in response to all pushed subqueries.
290307
let new_column_ref_idx = context.fields().len() + subqueries.len();
291-
subqueries.push(sq);
308+
subqueries.push((sq, SubqueryType::Scalar));
292309
Ok(ColumnRefPred::new(new_column_ref_idx).into_pred_node())
293310
}
311+
Expr::Exists(ex) => {
312+
let sq = &ex.subquery;
313+
let negated = ex.negated;
314+
315+
let new_column_ref_idx = context.fields().len() + subqueries.len();
316+
subqueries.push((sq, SubqueryType::Exists));
317+
if negated {
318+
Ok(FuncPred::new(
319+
FuncType::Not,
320+
ListPred::new(
321+
vec![ColumnRefPred::new(new_column_ref_idx).into_pred_node()],
322+
),
323+
)
324+
.into_pred_node())
325+
} else {
326+
Ok(ColumnRefPred::new(new_column_ref_idx).into_pred_node())
327+
}
328+
}
329+
Expr::InSubquery(insq) => {
330+
let sq = &insq.subquery;
331+
let expr =
332+
self.conv_into_optd_expr(insq.expr.as_ref(), context, dep_ctx, subqueries)?;
333+
let negated = insq.negated;
334+
335+
let new_column_ref_idx = context.fields().len() + subqueries.len();
336+
subqueries.push((
337+
sq,
338+
SubqueryType::Any {
339+
pred: Arc::unwrap_or_clone(expr),
340+
op: BinOpType::Eq,
341+
},
342+
));
343+
if negated {
344+
Ok(FuncPred::new(
345+
FuncType::Not,
346+
ListPred::new(
347+
vec![ColumnRefPred::new(new_column_ref_idx).into_pred_node()],
348+
),
349+
)
350+
.into_pred_node())
351+
} else {
352+
Ok(ColumnRefPred::new(new_column_ref_idx).into_pred_node())
353+
}
354+
}
294355
_ => bail!("Unsupported expression: {:?}", expr),
295356
}
296357
}
@@ -308,7 +369,7 @@ impl OptdPlanContext<'_> {
308369
dep_ctx,
309370
&mut subqueries,
310371
)?;
311-
let input = self.subqueries_to_dependent_joins(&subqueries, input, node.input.schema())?;
372+
let input = self.subqueries_to_dependent_joins(subqueries, input, node.input.schema())?;
312373
Ok(LogicalProjection::new(input, expr_list))
313374
}
314375

@@ -326,7 +387,7 @@ impl OptdPlanContext<'_> {
326387
&mut subqueries,
327388
)?;
328389

329-
let input = self.subqueries_to_dependent_joins(&subqueries, input, node.input.schema())?;
390+
let input = self.subqueries_to_dependent_joins(subqueries, input, node.input.schema())?;
330391

331392
Ok(LogicalFilter::new(input, expr))
332393
}
@@ -336,7 +397,7 @@ impl OptdPlanContext<'_> {
336397
exprs: &'a [logical_expr::Expr],
337398
context: &DFSchema,
338399
dep_ctx: Option<&DFSchema>,
339-
subqueries: &mut Vec<&'a Subquery>,
400+
subqueries: &mut Vec<(&'a Subquery, SubqueryType)>,
340401
) -> Result<ListPred> {
341402
let exprs = exprs
342403
.iter()
@@ -350,7 +411,7 @@ impl OptdPlanContext<'_> {
350411
exprs: &'a [logical_expr::SortExpr],
351412
context: &DFSchema,
352413
dep_ctx: Option<&DFSchema>,
353-
subqueries: &mut Vec<&'a Subquery>,
414+
subqueries: &mut Vec<(&'a Subquery, SubqueryType)>,
354415
) -> Result<ListPred> {
355416
let exprs = exprs
356417
.iter()
@@ -453,7 +514,7 @@ impl OptdPlanContext<'_> {
453514
DFJoinType::RightAnti => JoinType::RightAnti,
454515
DFJoinType::LeftSemi => JoinType::LeftSemi,
455516
DFJoinType::RightSemi => JoinType::RightSemi,
456-
_ => unimplemented!(),
517+
DFJoinType::LeftMark => JoinType::LeftMark,
457518
};
458519
let mut log_ops = Vec::with_capacity(node.on.len());
459520
let mut subqueries = vec![];

optd-datafusion-repr-adv-cost/src/adv_stats/filter.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,10 @@ impl<
6666
) -> f64 {
6767
match &expr_tree.typ {
6868
DfPredType::Constant(_) => Self::get_constant_selectivity(expr_tree),
69-
DfPredType::ColumnRef => unimplemented!("check bool type or else panic"),
69+
DfPredType::ColumnRef => {
70+
// TODO: Check that field is of bool type
71+
0.5 // TODO: placeholder---how can we get the selectivity?
72+
}
7073
DfPredType::UnOp(un_op_typ) => {
7174
assert!(expr_tree.children.len() == 1);
7275
let child = expr_tree.child(0);
@@ -104,7 +107,10 @@ impl<
104107
DfPredType::LogOp(log_op_typ) => {
105108
self.get_log_op_selectivity(*log_op_typ, &expr_tree.children, schema, column_refs)
106109
}
107-
DfPredType::Func(_) => unimplemented!("check bool type or else panic"),
110+
DfPredType::Func(_) => {
111+
// TODO: Check that field is of bool type
112+
0.5 // TODO: placeholder---how can we get the selectivity?
113+
}
108114
DfPredType::SortOrder(_) => {
109115
panic!("the selectivity of sort order expressions is undefined")
110116
}

optd-datafusion-repr-adv-cost/src/adv_stats/join.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,8 @@ impl<
198198
);
199199
join_filter_selectivity
200200
}
201+
// TODO: Does this make sense?
202+
JoinType::LeftMark => f64::max(inner_join_selectivity, 1.0 / right_row_cnt),
201203
_ => unimplemented!("join_typ={} is not implemented", join_typ),
202204
}
203205
}
@@ -359,7 +361,11 @@ impl<
359361
&self,
360362
base_col_refs: HashSet<BaseTableColumnRef>,
361363
) -> f64 {
362-
assert!(base_col_refs.len() > 1);
364+
// Hack to avoid issue w/ self joins...unsure if this is a good idea
365+
if base_col_refs.len() <= 1 {
366+
return 1.0;
367+
}
368+
363369
let num_base_col_refs = base_col_refs.len();
364370
base_col_refs
365371
.into_iter()

optd-datafusion-repr/src/explain.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ pub fn explain_plan_node(
7575
DfNodeType::RawDepJoin(_) => RawDependentJoin::from_plan_node(node)
7676
.unwrap()
7777
.explain(meta_map),
78-
DfNodeType::DepJoin(_) => DependentJoin::from_plan_node(node)
78+
DfNodeType::DepJoin => DependentJoin::from_plan_node(node)
7979
.unwrap()
8080
.explain(meta_map),
8181
DfNodeType::Scan => LogicalScan::from_plan_node(node).unwrap().explain(meta_map),

optd-datafusion-repr/src/memo_ext.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ fn enumerate_join_order_expr_inner<M: Memo<DfNodeType> + ?Sized>(
4141
visited: &mut HashMap<GroupId, Vec<LogicalJoinOrder>>,
4242
) -> Vec<LogicalJoinOrder> {
4343
let expr = memo.get_expr_memoed(current);
44-
match expr.typ {
44+
match &expr.typ {
4545
DfNodeType::Scan => {
4646
let table = memo.get_pred(expr.predicates[0]); // TODO: use unified repr
4747
let table = ConstantPred::from_pred_node(table)
@@ -50,7 +50,7 @@ fn enumerate_join_order_expr_inner<M: Memo<DfNodeType> + ?Sized>(
5050
.as_str();
5151
vec![LogicalJoinOrder::Table(table)]
5252
}
53-
DfNodeType::Join(_) | DfNodeType::DepJoin(_) | DfNodeType::RawDepJoin(_) => {
53+
DfNodeType::Join(_) | DfNodeType::DepJoin | DfNodeType::RawDepJoin(_) => {
5454
// Assume child 0 == left, child 1 == right
5555
let left = expr.children[0];
5656
let right = expr.children[1];

optd-datafusion-repr/src/plan_nodes.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ use pretty_xmlish::{Pretty, PrettyConfig};
3939
pub use projection::{LogicalProjection, PhysicalProjection};
4040
pub use scan::{LogicalScan, PhysicalScan};
4141
pub use sort::{LogicalSort, PhysicalSort};
42-
pub use subquery::{DependentJoin, RawDependentJoin}; // Add missing import
42+
pub use subquery::{DependentJoin, RawDependentJoin, SubqueryType};
4343

4444
use crate::explain::{explain_plan_node, explain_pred_node};
4545

@@ -69,16 +69,16 @@ impl std::fmt::Display for DfPredType {
6969

7070
/// DfNodeType FAQ:
7171
/// - The define_plan_node!() macro defines what the children of each join node are
72-
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
72+
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
7373
pub enum DfNodeType {
7474
// Developers: update `is_logical` function after adding new plan nodes
7575
// Plan nodes
7676
Projection,
7777
Filter,
7878
Scan,
7979
Join(JoinType),
80-
RawDepJoin(JoinType),
81-
DepJoin(JoinType),
80+
RawDepJoin(SubqueryType),
81+
DepJoin,
8282
Sort,
8383
Agg,
8484
EmptyRelation,

optd-datafusion-repr/src/plan_nodes/join.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ pub enum JoinType {
2020
RightSemi,
2121
LeftAnti,
2222
RightAnti,
23+
LeftMark,
2324
}
2425

2526
impl Display for JoinType {

optd-datafusion-repr/src/plan_nodes/macros.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ macro_rules! define_plan_node {
1818

1919
fn from_plan_node(plan_node: ArcDfPlanNode) -> Option<Self> {
2020
#[allow(unused_variables)]
21-
if let DfNodeType :: $variant $( ($inner_name) )? = plan_node.typ {
21+
if let DfNodeType :: $variant $( ($inner_name) )? = &plan_node.typ {
2222
Some(Self(plan_node))
2323
} else {
2424
None
@@ -105,9 +105,9 @@ macro_rules! define_plan_node {
105105
)*
106106

107107
$(
108-
pub fn $inner_name(&self) -> JoinType {
109-
if let DfNodeType :: $variant ($inner_name) = self.0 .typ {
110-
return $inner_name;
108+
pub fn $inner_name(&self) -> &$inner_typ {
109+
if let DfNodeType :: $variant ($inner_name) = &self.0.typ {
110+
return &$inner_name;
111111
} else {
112112
unreachable!();
113113
}

optd-datafusion-repr/src/plan_nodes/predicates/func_pred.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ pub enum FuncType {
1515
Scalar(String, DataType),
1616
Agg(String),
1717
Case,
18+
Not,
19+
IsNull,
20+
IsNotNull,
1821
}
1922

2023
impl std::fmt::Display for FuncType {

0 commit comments

Comments
 (0)