#!/usr/bin/env bash
#
# Submits the "taxidata" PySpark job to an EMR on EKS virtual cluster using
# `aws emr-containers start-job-run`.
#
# The entry point script, pod templates, input data, output data and logs all
# live under sub-prefixes of S3_BUCKET. All settings are fixed constants
# below; the script takes no arguments.
#
# Requires: the AWS CLI (`aws`) on PATH with credentials that are allowed to
# start job runs on the virtual cluster.

set -euo pipefail

# --- Target environment ------------------------------------------------------
readonly AWS_REGION='ap-northeast-2'
readonly S3_BUCKET='s3://finance-storage-01'
readonly EMR_VIRTUAL_CLUSTER_ID='pj35xafcvyhopbvdi8w5k57k1'
readonly EMR_EXECUTION_ROLE_ARN='arn:aws:iam::362708816803:role/EMRContainers-JobExecutionRole'
readonly CLOUDWATCH_LOG_GROUP='/emr-on-eks/finance-eks-cluster'

# --- Job settings ------------------------------------------------------------
# JOB_NAME is also reused as the executor pod name prefix and as the
# CloudWatch log stream prefix.
readonly JOB_NAME='taxidata'
readonly EMR_EKS_RELEASE_LABEL='emr-6.10.0-latest'

# --- S3 layout (all derived from S3_BUCKET) ----------------------------------
readonly SCRIPTS_S3_PATH="${S3_BUCKET}/scripts"
readonly INPUT_DATA_S3_PATH="${S3_BUCKET}/input"
readonly OUTPUT_DATA_S3_PATH="${S3_BUCKET}/output"

# The JSON payloads are built with here-documents instead of splicing
# variables in and out of single-quoted strings. The delimiter is unquoted so
# the ${...} references expand; the JSON itself contains no other shell
# metacharacters.

# Job driver: runs pyspark-taxi-trip.py with the input and output locations
# as its two positional arguments.
job_driver=$(cat <<EOF
{
  "sparkSubmitJobDriver": {
    "entryPoint": "${SCRIPTS_S3_PATH}/pyspark-taxi-trip.py",
    "entryPointArguments": [
      "${INPUT_DATA_S3_PATH}",
      "${OUTPUT_DATA_S3_PATH}"
    ],
    "sparkSubmitParameters": "--conf spark.executor.instances=2"
  }
}
EOF
)

# Configuration overrides: Spark defaults plus monitoring destinations
# (persistent app UI, CloudWatch and S3 logs).
# NOTE(review): "/data1" is presumably a volume provided by the pod template
# files referenced below - confirm against those templates.
configuration_overrides=$(cat <<EOF
{
  "applicationConfiguration": [
    {
      "classification": "spark-defaults",
      "properties": {
        "spark.driver.cores": "1",
        "spark.executor.cores": "1",
        "spark.driver.memory": "4g",
        "spark.executor.memory": "4g",
        "spark.kubernetes.driver.podTemplateFile": "${SCRIPTS_S3_PATH}/driver-pod-template.yaml",
        "spark.kubernetes.executor.podTemplateFile": "${SCRIPTS_S3_PATH}/executor-pod-template.yaml",
        "spark.local.dir": "/data1",
        "spark.kubernetes.submission.connectionTimeout": "60000000",
        "spark.kubernetes.submission.requestTimeout": "60000000",
        "spark.kubernetes.driver.connectionTimeout": "60000000",
        "spark.kubernetes.driver.requestTimeout": "60000000",
        "spark.kubernetes.executor.podNamePrefix": "${JOB_NAME}"
      }
    }
  ],
  "monitoringConfiguration": {
    "persistentAppUI": "ENABLED",
    "cloudWatchMonitoringConfiguration": {
      "logGroupName": "${CLOUDWATCH_LOG_GROUP}",
      "logStreamNamePrefix": "${JOB_NAME}"
    },
    "s3MonitoringConfiguration": {
      "logUri": "${S3_BUCKET}/logs/"
    }
  }
}
EOF
)

# Every expansion is quoted so each value reaches the CLI as exactly one
# argument, regardless of its content.
aws emr-containers start-job-run \
  --virtual-cluster-id "${EMR_VIRTUAL_CLUSTER_ID}" \
  --name "${JOB_NAME}" \
  --region "${AWS_REGION}" \
  --execution-role-arn "${EMR_EXECUTION_ROLE_ARN}" \
  --release-label "${EMR_EKS_RELEASE_LABEL}" \
  --job-driver "${job_driver}" \
  --configuration-overrides "${configuration_overrides}"

