% Journal article record (English metadata plus Persian *_fa fields; file must be processed as UTF-8).
% NOTE(review): first author is recorded as "Norouz, Hosseini" - family/given name may be swapped or misspelled; confirm against the article page before citing.
% NOTE(review): the eprint field holds a full-text PDF URL rather than an eprint identifier; kept as exported.
@article{norouz2016determining,
  author      = {Norouz, Hosseini and Asghari Moghaddam, Asghar and Nadiri, Attaallah},
  title       = {Determining vulnerable areas of {Malekan Plain Aquifer} for Nitrate, Using {Random Forest} method},
  journal     = {Journal of Environmental Studies},
  volume      = {41},
  number      = {4},
  pages       = {923--942},
  year        = {2016},
  publisher   = {دانشگاه تهران},
  issn        = {1025-8620},
  eissn       = {2345-6922},
  doi         = {10.22059/jes.2016.57144},
  abstract    = {Determining vulnerable areas of Malekan Plain Aquifer for Nitrate, Using Random Forest method Introduction:Management of groundwater, especially in dry regions such as Iran, is essential and this concern becomes further with development of agriculture, industry, population growth and climate changes, that affecting the quality and quantity of groundwater resources. Hence, groundwater contamination can treat the human health. Since groundwater moves slowly through the subsurface, the impact of anthropogenic activities may last for a relatively long time and for that reason, the environmental measures should be mainly focused on the prevention of the contamination. One of the ways to prevent of groundwater contamination is identifying vulnerable regions of aquifers and management of land use. The assessment of groundwater vulnerability maps requires the application of diverse methods and techniques, based on the hydrogeological knowledge of the region under research and on the application of predictive models. With the aim of deciding which areas are vulnerable a large data volume can be collected which cannot be effectively analyzed without an adequate and efficient model. Several methods have been devised to vulnerability mapping that relatively using fewer data and based on evidence of contamination. In this study to overcoming the problems of other methods the random forest (RF) algorithms is proposed.Materials and methods:Malekan plain is located in East Azarbaijan Province, Southeast of Urmia Lake, northwest of Iran, with 450 Km2.
This region is one of the very active cultivated areas which its water demands supply by groundwater resources. In recent years groundwater quality of the area is encountered with degradation problem. Malekan region have different geological formations such as Lalon, Shemshak, Lar formations, and a large part of the area in the western part is an alluvial deposits of Quaternary. Aquifer of this Plain is unconfined, which mainly formed by old and recent alluvial terraces, alluvial fans and fluvial sediments. Based on drilling wells logs and geophysical data, the west part of the plain is made of fine grained material with low permeable. According to farming and existing of grape farms in this region and intensive use of fertilizers and manure the groundwater nitrate concentration of the aquifer is high (Figure 1).To evaluate the quality of groundwater resources, especially the assessment of nitrate anomalies in groundwater of the Malekan plain, 27 samples were collected from groundwater resources in September 2014, and Hydrochemical analysis were carried out in Hydrology Laboratory of Tabriz University. In this study the random forest (RF) algorithms, which is a learning method based on ensemble of decision trees, is proposed. The RF technique has advantages over other methods due to having, high prediction accuracy, ability to learn nonlinear relationships and ability to determine the important variables in the prediction. In this paper RF method is used to estimate the Malekan Aquifer vulnerability, with four sets of data, including A model with all variables, B model with variables related to characteristics of the aquifer, C model with driving forces variables, and D model with variables related to the DRASTIC method. The predictions derived from all possible parameter combinations were evaluated using the root mean square error (RMSE) and mean square error.
The area under the curve statistic (AUC) was used to determine which models and which combination of dataset performed better. An AUC value of 1 is considered perfect. Fig1. Spatial distribution of nitrate concentrationResults and Discussions:From 23 explanatory variables used in model, five variables (depth to water table, hydraulic conductivity, distance to grape farms, hydraulic gradient and transmissivity) can describe the nitrates behavior in the Malekan plain aquifer with more accuracy, since a smaller MSE was obtained. In order to obtain continuous and standardized variables for all area of the study, all data were transformed into a raster format, and where were applied mainly three different approaches: 1) geostatistical techniques (e.g. hydraulic conductivity, hydraulic gradient and soil texture), 2) Euclidian distance raster calculations (potential point sources of contamination) and 3) classification of land cover from remotely sensed data and NDVI. In this paper RF method is used to estimate the Malekan Aquifer vulnerability, with four sets of data, including A model with all variables, B model with variables related to characteristics of the aquifer, C model with driving forces variables, and D model with variables related to the DRASTIC method. In order to set the value of k from which the error converges and which also makes estimation more reliable, models made up of 1000 trees were generated from all explanatory variables. The parameter was optimized by varying the number of split variables between 1 and the maximum number of variables of every subset. The resulting models were evaluated using the OOB error estimation. For the selection of the most accurate model the one in which the OOB error was the lowest is determined. Moreover, with the aim of reducing the dimensionality and improve the accuracy and interpretability of models, a FS strategy was adopted.
The most significant predictive features were selected by using the importance measures of RF. The least significant explanatory variables of every subset were reduced until reaching the minimum error rate. Nitrate concentration was rescaled to a new response variable for every experimental sample: samples with nitrate concentrations higher or equal to the threshold value were given a value equal to 1 and samples lower to the threshold a value equal to 0. The explanative variables (predictors) and response variable were combined together into a set of input feature vectors. These vectors formed the input to the RF algorithm and are known as input-feature vectors. The binary response variable (nitrate pollution) was used as target values for the training of the algorithm. In this study, which four models were used to predict nitrate contamination of groundwater, as shown in Fig2, A and B Models, respectively with RMSE equal to 0/11157 and 0/12214, predicted approximately 44 and 42 percent of the region's in the high vulnerability that located In the central and eastern parts of the aquifer. However C and D models, respectively with RMSE equal to 0/1392 and 0/1597, predicted approximately 15 and 24 percent of the region's in the high vulnerability and could not be trusted in assessment of Groundwater vulnerability. Fig 2. Vulnerability Map of the four models. A) All variables, B) variables related to characteristics of the aquifer, C) driving forces variables, and D) variables related to the DRASTICKeywords: Groundwater, Malekan plain, Nitrate, Vulnerability, Random Forest},
  keywords    = {groundwater,Malekan Plain,Nitrate,vulnerability,Random forest},
  title_fa    = {تعیین مناطق آسیب‌پذیر آبخوان دشت ملکان به نیترات با استفاده از روش جنگل تصادفی},
  abstract_fa = {به دلیل وجود آنومالی نیترات در آب زیرزمینی دشت ملکان، 27 نمونه از منابع آب زیرزمینی در شهریور سال 1393 جمع‌آوری و در آزمایشگاه آبشناسی دانشگاه تبریز تجزیة هیدروشیمیایی شد.
در مطالعة حاضر روش جنگل تصادفی (RF)، که روشی یادگیری مبتنی بر دسته‌ای از درخت‌های تصمیم است، برای ارزیابی آسیب‌پذیری پیشنهاد شده است. روش RF نسبت به روش‌های دیگر دارای مزایایی مانند دقت پیش‌بینی بالا، توانایی زیاد در تعیین متغیرهای مهم در پیش‌بینی و ماهیت غیرپارامتری است. در این مقاله عملکرد روش RF برای مدل‌سازی پیش‌بینی آسیب‌پذیری ویژة آبخوان دشت ملکان با استفاده از چهار دسته از داده‌ها شامل مدل A با تمام متغیرها، مدل B با متغیرهای مربوط به خصوصیات آبخوان، مدل C با متغیرهای نیروهای محرک و مدل D با متغیرهای مربوط به روش دراستیک ارزیابی شد. مدل‌های A و B با کمترین MSE به ترتیب برابر 012/0 و 013/0 و بیشترین AUC به‌منزلة روش‌های مناسب برای آسیب‌پذیری آب زیرزمینی به آلودگی نیترات انتخاب شدند و مدل‌های C و D با داشتن بیشترین MSE به ترتیب برابر با 015/0 و 026/0 و کمترین AUC به‌منزلة روش‌های نامناسب شناخته شدند. مدل A که دقیق‌ترین مدل شناخته شد 44 درصد از منطقه را در محدودة آسیب‌پذیری زیاد شناسایی کرد.},
  keywords_fa = {آب زیرزمینی,آسیب‌پذیری,جنگل تصادفی,دشت ملکان,نیترات},
  url         = {https://jes.ut.ac.ir/article_57144.html},
  eprint      = {https://jes.ut.ac.ir/article_57144_378e3a15291192ea6027984d51046ad5.pdf},
}