
Commit 109707e

updates for test speed improvements (#125)
* updates for test speed improvements
* updated tests
* updated tests
* updated tests
* updated tests
* reverted pytest changes - separate feature
* reverted pytest changes - separate feature
* reverted pytest changes - separate feature
* reverted pytest changes - separate feature
* changed partitioning to run more efficiently on github runner
* changed partitioning to run more efficiently on github runner
* changed partitioning to run more efficiently on github runner
* changed partitioning to run more efficiently on github runner
* changed partitioning to run more efficiently on github runner
* use as query name for spark instance
* changes in response to PR review feedback
* additional coverage
* reverted some review related changes as too much impact on tests
* test cleanup
* changes in response to PR review feedback post review changes post review changes
* corrected rebase issue
1 parent d6b1799 commit 109707e

8 files changed, +228 −259 lines changed

dbldatagen/data_generator.py

Lines changed: 17 additions & 20 deletions
@@ -10,10 +10,9 @@
 import re

 from pyspark.sql.types import LongType, IntegerType, StringType, StructType, StructField, DataType
-
+from .spark_singleton import SparkSingleton
 from .column_generation_spec import ColumnGenerationSpec
 from .datagen_constants import DEFAULT_RANDOM_SEED, RANDOM_SEED_FIXED, RANDOM_SEED_HASH_FIELD_NAME
-from .spark_singleton import SparkSingleton
 from .utils import ensure, topologicalSort, DataGenError, deprecated

 _OLD_MIN_OPTION = 'min'
@@ -31,7 +30,7 @@ class DataGenerator:
     :param rows: = amount of rows to generate
     :param startingId: = starting value for generated seed column
     :param randomSeed: = seed for random number generator
-    :param partitions: = number of partitions to generate
+    :param partitions: = number of partitions to generate, if not provided, uses `spark.sparkContext.defaultParallelism`
     :param verbose: = if `True`, generate verbose output
     :param batchSize: = UDF batch number of rows to pass via Apache Arrow to Pandas UDFs
     :param debug: = if set to True, output debug level of information
@@ -65,7 +64,18 @@ def __init__(self, sparkSession=None, name=None, randomSeedMethod=None,
         self._rowCount = rows
         self.starting_id = startingId
         self.__schema__ = None
-        self.partitions = partitions if partitions is not None else 10
+
+        if sparkSession is None:
+            sparkSession = SparkSingleton.getLocalInstance()
+
+        self.sparkSession = sparkSession
+
+        # if the active Spark session is stopped, you may end up with a valid SparkSession object but the underlying
+        # SparkContext will be invalid
+        assert sparkSession is not None, "Spark session not initialized"
+        assert sparkSession.sparkContext is not None, "Expecting spark session to have valid sparkContext"
+
+        self.partitions = partitions if partitions is not None else sparkSession.sparkContext.defaultParallelism

         # check for old versions of args
         if "starting_id" in kwargs:
@@ -121,20 +131,6 @@ def __init__(self, sparkSession=None, name=None, randomSeedMethod=None,
         self.withColumn(ColumnGenerationSpec.SEED_COLUMN, LongType(), nullable=False, implicit=True, omit=True)
         self._batchSize = batchSize

-        if sparkSession is None:
-            sparkSession = SparkSingleton.getInstance()
-
-        assert sparkSession is not None, "The spark session attribute must be initialized"
-
-        self.sparkSession = sparkSession
-        if sparkSession is None:
-            raise DataGenError("""Spark session not initialized
-
-            The spark session attribute must be initialized in the DataGenerator initialization
-
-            i.e DataGenerator(sparkSession=spark, name="test", ...)
-            """)
-
         # set up use of pandas udfs
         self._setupPandas(batchSize)

@@ -257,7 +253,7 @@ def explain(self, suppressOutput=False):

         output = ["", "Data generation plan", "====================",
                   f"spec=DateGenerator(name={self.name}, rows={self._rowCount}, startingId={self.starting_id}, partitions={self.partitions})"
-                  , ")", "", f"column build order: {self._buildOrder}", "", "build plan:"]
+                  , ")", "", f"column build order: {self._buildOrder}", "", "build plan:"]

         for plan_action in self._buildPlan:
             output.append(" ==> " + plan_action)
@@ -780,7 +776,8 @@ def _getBaseDataFrame(self, startId=0, streaming=False, options=None):
             df1 = df1.withColumnRenamed("id", ColumnGenerationSpec.SEED_COLUMN)

         else:
-            status = (f"Generating streaming data frame with ids from {startId} to {end_id} with {id_partitions} partitions")
+            status = (
+                f"Generating streaming data frame with ids from {startId} to {end_id} with {id_partitions} partitions")
             self.logger.info(status)
             self.executionHistory.append(status)

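Note: with the constructor change above, a DataGenerator built without an explicit `partitions` argument now falls back to `spark.sparkContext.defaultParallelism` instead of a hard-coded 10. A minimal sketch of the new behavior (the local[4] session setup below is an illustrative assumption, not part of this commit):

from pyspark.sql import SparkSession
import dbldatagen as dg

# illustrative local session; defaultParallelism for local[4] is 4
spark = (SparkSession.builder
         .master("local[4]")
         .appName("partitions_default_demo")
         .getOrCreate())

# no partitions argument supplied, so the generator picks up the session's default parallelism
ds = dg.DataGenerator(sparkSession=spark, name="partitions_default_demo", rows=1000)

assert ds.partitions == spark.sparkContext.defaultParallelism  # 4 under local[4]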

dbldatagen/spark_singleton.py

Lines changed: 16 additions & 7 deletions
@@ -10,7 +10,6 @@
 """

 import os
-import math
 import logging
 from pyspark.sql import SparkSession

@@ -28,17 +27,27 @@ def getInstance(cls):
         return SparkSession.builder.getOrCreate()

     @classmethod
-    def getLocalInstance(cls, appName="new Spark session"):
+    def getLocalInstance(cls, appName="new Spark session", useAllCores=True):
         """Create a machine local Spark instance for Datalib.
-        It uses 3/4 of the available cores for the spark session.
+        By default, it uses `n-1` cores of the available cores for the spark session,
+        where `n` is total cores available.

+        :param useAllCores: If `useAllCores` is True, then use all cores rather than `n-1` cores
         :returns: A Spark instance
         """
-        cpu_count = int(math.floor(os.cpu_count() * 0.75))
-        logging.info("cpu count: %d", cpu_count)
+        cpu_count = os.cpu_count()

-        return SparkSession.builder \
-            .master(f"local[{cpu_count}]") \
+        if useAllCores:
+            spark_core_count = cpu_count
+        else:
+            spark_core_count = cpu_count - 1
+
+        logging.info("Spark core count: %d", spark_core_count)
+
+        sparkSession = SparkSession.builder \
+            .master(f"local[{spark_core_count}]") \
             .appName(appName) \
             .config("spark.sql.warehouse.dir", "/tmp/spark-warehouse") \
             .getOrCreate()
+
+        return sparkSession
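With the new `useAllCores` flag, test code can choose between using every core and leaving one core free for the host. A short usage sketch (the app name and core counts are illustrative; the comments assume `os.cpu_count()` reports the machine's core count):

import os
from dbldatagen.spark_singleton import SparkSingleton

# default (useAllCores=True): master is local[os.cpu_count()]
# useAllCores=False:          master is local[os.cpu_count() - 1], leaving one core free
spark = SparkSingleton.getLocalInstance(appName="unit tests", useAllCores=False)

print(spark.sparkContext.defaultParallelism)  # typically os.cpu_count() - 1 here
spark.stop()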
