diff --git a/linearmodels/panel/model.py b/linearmodels/panel/model.py index a572b09d24..612d5d07b0 100644 --- a/linearmodels/panel/model.py +++ b/linearmodels/panel/model.py @@ -11,6 +11,7 @@ from scipy.sparse import csc_matrix, diags from scipy.sparse.linalg import lsmr +from linearmodels.iv.model import IVGMM from linearmodels.panel.covariance import ( ACCovariance, ClusteredCovariance, @@ -3099,3 +3100,208 @@ def from_formula( mod = cls(dependent, exog, weights=weights, check_rank=check_rank) mod.formula = formula return mod + + +class ArellanoBond(_PanelModelBase): + """ + Arellano-Bond estimator model for lagged panel data. + + Parameters + ---------- + dependent : array_like + Dependent (left-hand-side) variable (time by entity) + exog : array_like + Exogenous or right-hand-side variables (variable by time by entity). + weights : array_like, optional + Weights to use in estimation. Assumes residual variance is + proportional to inverse of weight to that the residual time + the weight should be homoskedastic. + check_rank : bool, optional + Flag indicating whether to perform a rank check on the exogenous + variables to ensure that the model is identified. Skipping this + check can reduce the time required to validate a model specification. + Results may be numerically unstable if this check is skipped and + the matrix is not full rank. + lags : int, optional + The number of lags of the dependent series. Default value is 1 + system_gmm : bool, optional + Flag that if set to true we have additional instruments of lagged + differences. + + Notes + ----- + Fixed effects OLS will usually result in biased estimates of rho because of + the correlation between y(i,t) and e(i,t) stemming from u(i). + The Arrelano-Bond procedure first differences the observables and then uses + lags of y as instruments. + + The model is given by: + + .. math:: + + y_{it} = \rho * y_{it-1} + \beta*x_{it} + u_i + \epsilon_{it} + """ + + def __init__( + self, + dependent: PanelDataLike, + exog: PanelDataLike, + *, + weights: Optional[PanelDataLike] = None, + check_rank: Optional[bool] = True, + lags: Optional[int] = 1, + system_gmm: Optional[bool] = False, + weight_type: Optional[str] = "clustered", + iv_max_lags: Optional[int] = 1000, + ) -> None: + + super().__init__(dependent, exog, weights=weights, check_rank=check_rank) + + self.dependent = dependent + self.exog = exog + self.lags = lags + self.iv_max_lags = iv_max_lags + + self.starting_period = min(dependent.index.get_level_values(1)) + self.total_number_of_periods = ( + max(dependent.index.get_level_values(1)) + 1 - self.starting_period + ) + + self.y_var = dependent.name + self.diff_y_var = "diff_" + self.y_var + self.x_vars = exog.columns.tolist() + self.diff_x_vars = ["diff_" + x for x in self.x_vars] + + self.data = DataFrame() + self.data[self.y_var] = dependent + + self._calculate_diffs_and_lags() + + self._calculate_arellano_bond_instruments() + + if system_gmm: + self._calculate_system_gmm_instruments() + + dropped = self.data.dropna() + dropped["CLUSTER_VAR"] = dropped.index.get_level_values(0) + + self.model: IVGMM = IVGMM( + dependent=dropped[self.diff_y_var], + exog=dropped[self.diff_x_vars], + endog=dropped[self.lag_diff_y_vars], + instruments=dropped[self.instrument_vars], + weight_type=weight_type, + clusters=dropped["CLUSTER_VAR"] if weight_type == "clustered" else None, + ) + + def _calculate_diffs_and_lags(self) -> None: + """ + Generate and store the lags of the differenced endog and exog variables + """ + + self.data[self.diff_y_var] = self.dependent.groupby(level=0).diff() + + self.lag_y_vars = [] + self.lag_diff_y_vars = [] + for k in range(1, self.lags + 1): + col = f"lag_{k}_{self.y_var}" + col_diff = f"lag_{k}_{self.diff_y_var}" + + self.data[col] = self.data[self.y_var].shift(k) + self.lag_y_vars.append(col) + + self.data[col_diff] = self.data[self.diff_y_var].shift(k) + self.lag_diff_y_vars.append(col_diff) + + for x in self.x_vars: + self.data[x] = self.exog[x] + self.data["diff_" + x] = self.exog[x].groupby(level=0).diff() + + def _calculate_arellano_bond_instruments(self) -> None: + """ + Calculate the instruments for the Arrelano Bond procedure i.e. lags of the endog levels for different time periods + """ + self.instrument_vars = [] + for lag_number in range( + self.lags + 1, + min(self.total_number_of_periods, self.lags + 1 + self.iv_max_lags), + ): # TODO: Check this -- works for lags == 1, not sure if for anything else + for current_period in range(lag_number, self.total_number_of_periods): + + col = f"instrument_period_{current_period + self.starting_period}_lag_{lag_number}" + + data_pos = ( + self.dependent.index.get_level_values(1) + == current_period + self.starting_period + ) + instrument = Series(0, index=self.dependent.index) + instrument.loc[data_pos] = self.dependent.groupby(level=0).shift( + lag_number + )[data_pos] + self.data[col] = instrument + + self.instrument_vars.append(col) + + def _calculate_system_gmm_instruments(self) -> None: + """ + The systems GMM estimator adds additional instruments of lagged differences + """ + + system_gmm_instrument_vars = [] + for t in range( + self.lags, self.total_number_of_periods + ): # TODO: Check this -- works for lags == 1, not sure if for anything else + col = f"instrument_diff_period{t}_lag" + system_gmm_instrument_vars.append(col) + self.data[col] = ( + self.data[self.lag_diff_y_vars[0]].groupby(level=0).shift(self.lags) + ) + self.data.loc[ + self.dependent.index.get_level_values(1) != t + self.starting_period, + col, + ] = 0 + + # Then to estimate the system GMM we stack differenced and undifferenced data and their corresponding instruments + cols1 = ( + [self.diff_y_var] + + self.lag_diff_y_vars + + self.diff_x_vars + + self.instrument_vars + ) # variables used for the differences part of the regression + cols2 = ( + [self.y_var] + self.lag_y_vars + self.x_vars + system_gmm_instrument_vars + ) # variable used for the levels part of the regression + cols2R = ( + [self.diff_y_var] + + self.lag_diff_y_vars + + self.diff_x_vars + + system_gmm_instrument_vars + ) # used to rename the variables that we want to overlap in the system reg. + + # The differenced data set + data1 = self.data[cols1].copy() + for c in system_gmm_instrument_vars: + data1[c] = 0 # zero out the instruments that apply to undifferenced data + + # The undifferenced data set + data2 = self.data[cols2].copy() + data2.columns = cols2R + for c in self.instrument_vars: + data2[c] = 0 # zero out the instruments that apply to differenced data + + # Add an index level to each data series so we can join without creating duplicate index values + a1 = data1.index.get_level_values(0) + a2 = data1.index.get_level_values(1) + n = len(a1) + data1.index = MultiIndex.from_arrays([[1] * n, a1, a2]) + data2.index = MultiIndex.from_arrays([[2] * n, a1, a2]) + + # Now append the series together + self.data = data1.append(data2) + + def fit(self): + return self.model.fit() + + @property + def instruments(self): + return self.model.instruments