Skip to content

Commit 3acb34a

Browse files
authored
Merge pull request #4 from hatamiarash7/input-array
Support Input Array
2 parents fa40c6d + 1be96c1 commit 3acb34a

21 files changed

+287
-64
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,4 @@ test/python/__pycache__/
88
.Rhistory
99
*.log
1010
*.csv
11-
!test/data/tranco.csv
11+
!test/data/*.csv

README.md

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -239,20 +239,20 @@ You can use this function to get the ranking of a domain:
239239

240240
```sql
241241
D SELECT get_tranco_rank('microsoft.com') as rank;
242-
┌───────┐
243-
│ rank  │
244-
│ int32 │
245-
├───────┤
246-
│     2 │
247-
└───────┘
242+
┌─────────┐
243+
│  rank   │
244+
│ varchar │
245+
├─────────┤
246+
│ 2       │
247+
└─────────┘
248248

249249
D SELECT get_tranco_rank('cloudflare.com') as rank;
250-
┌───────┐
251-
│ rank  │
252-
│ int32 │
253-
├───────┤
254-
│    13 │
255-
└───────┘
250+
┌─────────┐
251+
│  rank   │
252+
│ varchar │
253+
├─────────┤
254+
│ 13      │
255+
└─────────┘
256256
```
257257

258258
### Get Extension Version

src/functions/extract_domain.cpp

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,23 @@ namespace duckdb
1111
{
1212
// Extract the input from the arguments
1313
auto &input_vector = args.data[0];
14-
auto input = input_vector.GetValue (0).ToString ();
14+
auto result_data = FlatVector::GetData<string_t> (result);
1515

16-
if (input.empty ())
16+
for (idx_t i = 0; i < args.size (); i++)
1717
{
18-
result.SetValue (0, Value (""));
19-
return;
20-
}
21-
22-
// Extract the domain using the utility function
23-
auto domain = netquack::ExtractDomain (state, input);
18+
auto input = input_vector.GetValue (i).ToString ();
2419

25-
result.SetValue (0, Value (domain));
20+
try
21+
{
22+
// Extract the domain using the utility function
23+
auto domain = netquack::ExtractDomain (state, input);
24+
result_data[i] = StringVector::AddString (result, domain);
25+
}
26+
catch (const std::exception &e)
27+
{
28+
result_data[i] = "Error extracting domain: " + std::string (e.what ());
29+
}
30+
}
2631
}
2732

2833
namespace netquack

src/functions/extract_host.cpp

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,23 @@ namespace duckdb
99
{
1010
// Extract the input from the arguments
1111
auto &input_vector = args.data[0];
12-
auto input = input_vector.GetValue (0).ToString ();
12+
auto result_data = FlatVector::GetData<string_t> (result);
1313

14-
// Extract the host using the utility function
15-
auto host = netquack::ExtractHost (input);
14+
for (idx_t i = 0; i < args.size (); i++)
15+
{
16+
auto input = input_vector.GetValue (i).ToString ();
1617

17-
// Set the result
18-
result.SetValue (0, Value (host));
18+
try
19+
{
20+
// Extract the host using the utility function
21+
auto host = netquack::ExtractHost (input);
22+
result_data[i] = StringVector::AddString (result, host);
23+
}
24+
catch (const std::exception &e)
25+
{
26+
result_data[i] = "Error extracting host: " + std::string (e.what ());
27+
}
28+
}
1929
}
2030

2131
namespace netquack

src/functions/extract_path.cpp

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,23 @@ namespace duckdb
99
{
1010
// Extract the input from the arguments
1111
auto &input_vector = args.data[0];
12-
auto input = input_vector.GetValue (0).ToString ();
12+
auto result_data = FlatVector::GetData<string_t> (result);
1313

14-
// Extract the path using the utility function
15-
auto path = netquack::ExtractPath (input);
14+
for (idx_t i = 0; i < args.size (); i++)
15+
{
16+
auto input = input_vector.GetValue (i).ToString ();
1617

17-
// Set the result
18-
result.SetValue (0, Value (path));
18+
try
19+
{
20+
// Extract the path using the utility function
21+
auto path = netquack::ExtractPath (input);
22+
result_data[i] = StringVector::AddString (result, path);
23+
}
24+
catch (const std::exception &e)
25+
{
26+
result_data[i] = "Error extracting path: " + std::string (e.what ());
27+
}
28+
};
1929
}
2030

2131
namespace netquack
@@ -26,11 +36,12 @@ namespace duckdb
2636
// Explanation:
2737
// ^ - Start of the string
2838
// (?: - Non-capturing group for the protocol and domain part
29-
// (?:(?:ftp|https?|rsync):\/\/)? - Optional ftp://, http://, https://, or rsync://
30-
// (?:[^\/\s]+) - Domain name (any characters except '/' or whitespace)
39+
// (?:(?:ftp|https?|rsync):\/\/)? - Optional protocol (ftp://, http://, https://, or rsync://)
40+
// (?:[^\/\s]+) - Domain name or IP address (any characters except '/' or whitespace)
3141
// )
32-
// (\/[^?#]*) - Capturing group for the path (starts with '/', followed by any characters except '?' or '#')
33-
std::regex path_regex (R"(^(?:(?:(?:ftp|https?|rsync):\/\/)?(?:[^\/\s]+))(\/[^?#]*))");
42+
// (\/[^?#]*)? - Optional capturing group for the path (starts with '/', followed by any characters except '?' or '#')
43+
// - The '?' at the end makes the path component optional, allowing the regex to match URLs with or without a path
44+
std::regex path_regex (R"(^(?:(?:(?:ftp|https?|rsync):\/\/)?(?:[^\/\s]+))(\/[^?#]*)?)");
3445
std::smatch path_match;
3546

3647
// Use regex_search to find the path component in the input string

src/functions/extract_query.cpp

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,25 @@ namespace duckdb
77
// Function to extract the query string from a URL
88
void ExtractQueryStringFunction (DataChunk &args, ExpressionState &state, Vector &result)
99
{
10-
// Extract the URL from the input
11-
auto &url_vector = args.data[0];
12-
auto url = url_vector.GetValue (0).ToString ();
10+
// Extract the input from the arguments
11+
auto &input_vector = args.data[0];
12+
auto result_data = FlatVector::GetData<string_t> (result);
1313

14-
// Extract the query string
15-
auto query_string = netquack::ExtractQueryString (url);
14+
for (idx_t i = 0; i < args.size (); i++)
15+
{
16+
auto input = input_vector.GetValue (i).ToString ();
1617

17-
// Set the result
18-
result.SetValue (0, Value (query_string));
18+
try
19+
{
20+
// Extract the query string using the utility function
21+
auto query_string = netquack::ExtractQueryString (input);
22+
result_data[i] = StringVector::AddString (result, query_string);
23+
}
24+
catch (const std::exception &e)
25+
{
26+
result_data[i] = "Error extracting query string: " + std::string (e.what ());
27+
}
28+
};
1929
}
2030

2131
namespace netquack

src/functions/extract_schema.cpp

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,23 @@ namespace duckdb
99
{
1010
// Extract the input from the arguments
1111
auto &input_vector = args.data[0];
12-
auto input = input_vector.GetValue (0).ToString ();
12+
auto result_data = FlatVector::GetData<string_t> (result);
1313

14-
// Extract the schema using the utility function
15-
auto schema = netquack::ExtractSchema (input);
14+
for (idx_t i = 0; i < args.size (); i++)
15+
{
16+
auto input = input_vector.GetValue (i).ToString ();
1617

17-
// Set the result
18-
result.SetValue (0, Value (schema));
18+
try
19+
{
20+
// Extract the schema using the utility function
21+
auto schema = netquack::ExtractSchema (input);
22+
result_data[i] = StringVector::AddString (result, schema);
23+
}
24+
catch (const std::exception &e)
25+
{
26+
result_data[i] = "Error extracting schema: " + std::string (e.what ());
27+
}
28+
};
1929
}
2030

2131
namespace netquack

src/functions/extract_subdomain.cpp

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,23 @@ namespace duckdb
1111
{
1212
// Extract the input from the arguments
1313
auto &input_vector = args.data[0];
14-
auto input = input_vector.GetValue (0).ToString ();
14+
auto result_data = FlatVector::GetData<string_t> (result);
1515

16-
// Extract the sub-domain using the utility function
17-
auto subdomain = netquack::ExtractSubDomain (state, input);
16+
for (idx_t i = 0; i < args.size (); i++)
17+
{
18+
auto input = input_vector.GetValue (i).ToString ();
1819

19-
result.SetValue (0, Value (subdomain));
20+
try
21+
{
22+
// Extract the subdomain using the utility function
23+
auto subdomain = netquack::ExtractSubDomain (state, input);
24+
result_data[i] = StringVector::AddString (result, subdomain);
25+
}
26+
catch (const std::exception &e)
27+
{
28+
result_data[i] = "Error extracting subdomain: " + std::string (e.what ());
29+
}
30+
}
2031
}
2132

2233
namespace netquack

src/functions/extract_tld.cpp

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,23 @@ namespace duckdb
1111
{
1212
// Extract the input from the arguments
1313
auto &input_vector = args.data[0];
14-
auto input = input_vector.GetValue (0).ToString ();
14+
auto result_data = FlatVector::GetData<string_t> (result);
1515

16-
// Extract the top-level domain using the utility function
17-
auto tld = netquack::ExtractTLD (state, input);
16+
for (idx_t i = 0; i < args.size (); i++)
17+
{
18+
auto input = input_vector.GetValue (i).ToString ();
1819

19-
// Set the result
20-
result.SetValue (0, Value (tld));
20+
try
21+
{
22+
// Extract the top-level domain using the utility function
23+
auto tld = netquack::ExtractTLD (state, input);
24+
result_data[i] = StringVector::AddString (result, tld);
25+
}
26+
catch (const std::exception &e)
27+
{
28+
result_data[i] = "Error extracting tld: " + std::string (e.what ());
29+
}
30+
}
2131
}
2232

2333
namespace netquack

src/functions/get_tranco.cpp

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -148,13 +148,28 @@ namespace duckdb
148148
throw std::runtime_error ("Tranco table not found. Download it first using `SELECT update_tranco(true);`");
149149
}
150150

151-
auto &domain_vector = args.data[0];
152-
auto domain = domain_vector.GetValue (0).ToString ();
151+
// Extract the input from the arguments
152+
auto &input_vector = args.data[0];
153+
auto result_data = FlatVector::GetData<string_t> (result);
153154

154-
auto query = "SELECT rank FROM tranco_list WHERE domain = '" + domain + "'";
155-
auto query_result = con.Query (query);
155+
for (idx_t i = 0; i < args.size (); i++)
156+
{
157+
auto input = input_vector.GetValue (i).ToString ();
158+
159+
try
160+
{
161+
auto query = "SELECT rank FROM tranco_list WHERE domain = '" + input + "'";
162+
163+
auto query_result = con.Query (query);
164+
auto rank = query_result->RowCount () > 0 ? query_result->GetValue (0, 0) : Value ();
156165

157-
result.SetValue (0, query_result->RowCount () > 0 ? query_result->GetValue (0, 0) : Value ());
166+
result_data[i] = StringVector::AddString (result, rank.ToString ());
167+
}
168+
catch (const std::exception &e)
169+
{
170+
result_data[i] = "Error extracting tranco rank: " + std::string (e.what ());
171+
}
172+
}
158173
}
159174
} // namespace netquack
160175
} // namespace duckdb

src/netquack_extension.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ namespace duckdb
9595
auto get_tranco_rank_function = ScalarFunction (
9696
"get_tranco_rank",
9797
{ LogicalType::VARCHAR },
98-
LogicalType::INTEGER,
98+
LogicalType::VARCHAR,
9999
netquack::GetTrancoRankFunction);
100100
ExtensionUtil::RegisterFunction (instance, get_tranco_rank_function);
101101

test/data/examples.csv

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
example.com
2+
example.com.ac
3+
example.com.co
4+
a.example.com
5+
example.com/a
6+
example.com.ac/a
7+
https://example.com
8+
https://a.example.com
9+
http://example.com.ac/path/?a=1&b=2&

test/data/examples_tranco.csv

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
microsoft.com
2+
googleapis.com
3+
gstatic.com
4+
apple.com

test/sql/extract_domain.test

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44

55
require netquack
66

7+
statement ok
8+
CREATE TABLE uri_list AS SELECT * FROM read_csv('test/data/examples.csv', header=false, columns={'uri': 'VARCHAR'});
9+
710
query I
811
SELECT extract_domain('example.com');
912
----
@@ -172,4 +175,17 @@ SELECT extract_domain('http:/example.com.ac/path');
172175
query I
173176
SELECT extract_domain('http:/example.com.ac:443/path');
174177
----
175-
(empty)
178+
(empty)
179+
180+
query I
181+
SELECT extract_domain(uri) from uri_list;
182+
----
183+
example.com
184+
example.com.ac
185+
example.com.co
186+
example.com
187+
example.com
188+
example.com.ac
189+
example.com
190+
example.com
191+
example.com.ac

test/sql/extract_host.test

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44

55
require netquack
66

7+
statement ok
8+
CREATE TABLE uri_list AS SELECT * FROM read_csv('test/data/examples.csv', header=false, columns={'uri': 'VARCHAR'});
9+
710
query I
811
SELECT extract_host('http://example.com.ac/path');
912
----
@@ -63,3 +66,16 @@ query I
6366
SELECT extract_host('rsync://rpki.example.com/path');
6467
----
6568
rpki.example.com
69+
70+
query I
71+
SELECT extract_host(uri) from uri_list;
72+
----
73+
example.com
74+
example.com.ac
75+
example.com.co
76+
a.example.com
77+
example.com
78+
example.com.ac
79+
example.com
80+
a.example.com
81+
example.com.ac

0 commit comments

Comments
 (0)