Search

EMR on EKS

# Create the Kubernetes namespace used by the rest of this walkthrough.
kubectl create namespace ws-emr
Shell
복사
# AWS account ID of the credentials currently in use.
ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)

# OIDC issuer of the EKS cluster named by $CLUSTER_NAME, without the URL scheme.
# The leading "https://" is removed by pattern rather than by fixed character
# columns, so the value is never truncated regardless of its length.
OIDC_PROVIDER=$(aws eks describe-cluster \
  --name "${CLUSTER_NAME}" \
  --query "cluster.identity.oidc.issuer" \
  --output text | sed -e 's|^https://||')
Shell
복사
# Write the IAM policy document that allows every KMS action on every resource.
# The heredoc body and its terminator must each be on their own line; the
# delimiter is quoted because the JSON needs no shell expansion.
cat << 'EOF' > kms-policy.json
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Sid": "VisualEditor0",
      "Effect": "Allow",
      "Action": "kms:*",
      "Resource": "*"
    }
  ]
}
EOF
Shell
복사
# Write the trust policy for the job execution role.
# Statement 1 lets the emr-containers service assume the role; statement 2 lets
# service accounts matching emr-containers-sa-* in the ws-emr namespace assume
# it through the cluster's OIDC provider.
# The delimiter is intentionally unquoted: ${ACCOUNT_ID} and ${OIDC_PROVIDER}
# must be expanded by the shell, so both have to be set before running this.
cat << EOF > emr-trust-policy.json
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Service": "emr-containers.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    },
    {
      "Effect": "Allow",
      "Principal": {
        "Federated": "arn:aws:iam::${ACCOUNT_ID}:oidc-provider/${OIDC_PROVIDER}"
      },
      "Action": "sts:AssumeRoleWithWebIdentity",
      "Condition": {
        "StringLike": {
          "${OIDC_PROVIDER}:sub": "system:serviceaccount:ws-emr:emr-containers-sa-*"
        }
      }
    }
  ]
}
EOF
Shell
복사
# Create the customer-managed IAM policy from kms-policy.json.
# Each backslash must be the last character on its line to act as a
# continuation.
aws iam create-policy \
  --policy-name ws-kms-policy \
  --policy-document file://kms-policy.json
Shell
복사
# Create the job execution role using emr-trust-policy.json as its trust policy.
aws iam create-role \
  --role-name ws-job-execution-role \
  --assume-role-policy-document file://emr-trust-policy.json
Shell
복사
# Attach the three AWS-managed policies and the customer-managed ws-kms-policy
# to the job execution role, one attach-role-policy call per policy ARN.
# ${ACCOUNT_ID} must already be set for the customer-managed ARN.
for policy_arn in \
  "arn:aws:iam::aws:policy/CloudWatchLogsFullAccess" \
  "arn:aws:iam::aws:policy/AmazonS3FullAccess" \
  "arn:aws:iam::aws:policy/AmazonRDSFullAccess" \
  "arn:aws:iam::${ACCOUNT_ID}:policy/ws-kms-policy"; do
  aws iam attach-role-policy \
    --policy-arn "${policy_arn}" \
    --role-name ws-job-execution-role
done
Shell
복사
# Write the Kubernetes RBAC manifest: a Role named emr-containers in the ws-emr
# namespace, plus a RoleBinding that grants it to the emr-sa service account.
# YAML is indentation-sensitive, so the document is laid out one key per line;
# the delimiter is quoted because the manifest needs no shell expansion.
cat > ws-job-execution-role.yaml << 'EOF'
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: emr-containers
  namespace: ws-emr
rules:
  - apiGroups: [""]
    resources: ["namespaces"]
    verbs: ["get"]
  - apiGroups: [""]
    resources: ["serviceaccounts", "services", "configmaps", "events", "pods", "pods/log"]
    verbs: ["get", "list", "watch", "describe", "create", "edit", "delete", "deletecollection", "annotate", "patch", "label"]
  - apiGroups: [""]
    resources: ["secrets"]
    verbs: ["create", "patch", "delete", "watch"]
  - apiGroups: ["apps"]
    resources: ["statefulsets", "deployments"]
    verbs: ["get", "list", "watch", "describe", "create", "edit", "delete", "annotate", "patch", "label"]
  - apiGroups: ["batch"]
    resources: ["jobs"]
    verbs: ["get", "list", "watch", "describe", "create", "edit", "delete", "annotate", "patch", "label"]
  - apiGroups: ["extensions"]
    resources: ["ingresses"]
    verbs: ["get", "list", "watch", "describe", "create", "edit", "delete", "annotate", "patch", "label"]
  - apiGroups: ["rbac.authorization.k8s.io"]
    resources: ["roles", "rolebindings"]
    verbs: ["get", "list", "watch", "describe", "create", "edit", "delete", "deletecollection", "annotate", "patch", "label"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: emr-containers
  namespace: ws-emr
subjects:
  - kind: ServiceAccount
    name: emr-sa
    namespace: ws-emr
roleRef:
  kind: Role
  name: emr-containers
  apiGroup: rbac.authorization.k8s.io
EOF
Shell
복사
# Apply the Role and RoleBinding manifest to the cluster.
kubectl apply --filename ws-job-execution-role.yaml
Shell
복사
# Create the customer-managed policy ws-emr-policy from an inline document that
# allows all RDS, Secrets Manager, S3, KMS, CloudWatch Logs and ECR actions on
# every resource. The document is single-quoted, so the shell passes it through
# verbatim; the added newlines are insignificant JSON whitespace.
aws iam create-policy \
  --policy-name ws-emr-policy \
  --policy-document '{
    "Version": "2012-10-17",
    "Statement": [
      {
        "Effect": "Allow",
        "Action": [
          "rds:*",
          "secretsmanager:*",
          "s3:*",
          "kms:*",
          "logs:*",
          "ecr:*"
        ],
        "Resource": "*"
      }
    ]
  }'
Shell
복사
# Create the emr-sa service account in ws-emr, backed by the IAM role
# ws-emr-role with ws-emr-policy attached.
# - The account ID falls back to ACCOUNT_ID (the name the other snippets set)
#   when AWS_ACCOUNT_ID is not defined, so the policy ARN is never built with
#   an empty account.
# - The region falls back to ap-northeast-2, the region the other snippets
#   use; unquoted and empty, --region would have consumed --approve as its
#   value.
eksctl create iamserviceaccount \
  --name emr-sa \
  --namespace ws-emr \
  --cluster "${CLUSTER_NAME}" \
  --role-name ws-emr-role \
  --attach-policy-arn "arn:aws:iam::${AWS_ACCOUNT_ID:-${ACCOUNT_ID}}:policy/ws-emr-policy" \
  --region "${AWS_DEFAULT_REGION:-ap-northeast-2}" \
  --approve
Shell
복사
# Register the ws-emr namespace of the EKS cluster "ws-eks-cluster" as an
# EMR on EKS virtual cluster named ws-virtual-cluster.
# The container-provider JSON is single-quoted and passed through verbatim.
aws emr-containers create-virtual-cluster \
  --name ws-virtual-cluster \
  --container-provider '{
    "id": "ws-eks-cluster",
    "type": "EKS",
    "info": {
      "eksInfo": {
        "namespace": "ws-emr"
      }
    }
  }' \
  --region ap-northeast-2
Shell
복사
# Environment for submitting the Spark job from the current shell.
# Each export is its own command: later values are built from
# AWS_DEFAULT_REGION and ACCOUNT_ID, which must already be assigned when the
# command substitutions and expansions below are evaluated.
export AWS_DEFAULT_REGION="ap-northeast-2"
export ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"

# ID of the virtual cluster named ws-virtual-cluster that is in RUNNING state.
export VIRTUAL_CLUSTER_ID="$(aws emr-containers list-virtual-clusters \
  --query "virtualClusters[?name=='ws-virtual-cluster' && state=='RUNNING'].id" \
  --output text \
  --region "${AWS_DEFAULT_REGION}")"

export JOB_EXECUTION_ROLE_ARN="$(aws iam get-role \
  --role-name ws-job-execution-role \
  --query Role.Arn \
  --output text)"

export SPARK_IMAGE_URI="${ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com/ws-emr/spark:latest"
export S3_BUCKET="ws-cc-raw-data-${ACCOUNT_ID}"

# Endpoint address of the RDS instance ws-db-instance.
export RDS_HOST="$(aws rds describe-db-instances \
  --db-instance-identifier ws-db-instance \
  --query "DBInstances[0].Endpoint.Address" \
  --output text)"
export RDS_PORT="5433"
export RDS_DATABASE="fraud_detection"
export RDS_USER="wsadmin"
# NOTE(review): the password default is kept only for backward compatibility;
# a value already present in the environment takes precedence so the secret
# does not have to live in this file.
export RDS_PASSWORD="${RDS_PASSWORD:-wspassword123}"

# The quotes are deliberately stored backslash-escaped: the value is meant to
# be spliced into a JSON string. It is assigned but not exported.
TOLERATIONS='[{\"key\":\"eks.amazonaws.com/compute-type\",\"operator\":\"Equal\",\"value\":\"emr\",\"effect\":\"NoSchedule\"}]'
Shell
복사
# Generate submit-spark-job.sh. The heredoc delimiter is quoted, so nothing
# below is expanded now; every $VAR and $(...) is evaluated when the generated
# script runs. The shebang has to be the first line of the generated file and
# the terminator has to sit alone on its line.
cat > submit-spark-job.sh << 'EOF'
#!/bin/bash
# Submits the credit-card-fraud-detection Spark job to the EMR on EKS virtual
# cluster named ws-virtual-cluster.
# Abort on the first failed lookup instead of submitting a job built from
# empty values.
set -euo pipefail

export AWS_DEFAULT_REGION="ap-northeast-2"

# Assignments are kept separate from `export` so that a failing AWS CLI call
# is not masked by export's own exit status.
ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
export ACCOUNT_ID

VIRTUAL_CLUSTER_ID=$(aws emr-containers list-virtual-clusters \
  --query "virtualClusters[?name=='ws-virtual-cluster' && state=='RUNNING'].id" \
  --output text \
  --region "${AWS_DEFAULT_REGION}")
export VIRTUAL_CLUSTER_ID
: "${VIRTUAL_CLUSTER_ID:?no RUNNING virtual cluster named ws-virtual-cluster was found}"

JOB_EXECUTION_ROLE_ARN=$(aws iam get-role \
  --role-name ws-job-execution-role \
  --query Role.Arn \
  --output text)
export JOB_EXECUTION_ROLE_ARN

export SPARK_IMAGE_URI="${ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com/ws-emr/spark:latest"
export S3_BUCKET="ws-cc-raw-data-${ACCOUNT_ID}"

RDS_HOST=$(aws rds describe-db-instances \
  --db-instance-identifier ws-db-instance \
  --query "DBInstances[0].Endpoint.Address" \
  --output text)
export RDS_HOST
export RDS_PORT="5433"
export RDS_DATABASE="fraud_detection"
export RDS_USER="wsadmin"
# NOTE(review): the default is kept for backward compatibility; a value
# already present in the environment takes precedence.
export RDS_PASSWORD="${RDS_PASSWORD:-wspassword123}"

# Quotes are stored backslash-escaped because this value is spliced into the
# sparkSubmitParameters JSON string below.
TOLERATIONS='[{\"key\":\"eks.amazonaws.com/compute-type\",\"operator\":\"Equal\",\"value\":\"emr\",\"effect\":\"NoSchedule\"}]'

# Submit the job.
# The two JSON arguments stay on single lines: a raw newline inside a JSON
# string value (sparkSubmitParameters) would make the document invalid.
aws emr-containers start-job-run \
  --virtual-cluster-id "${VIRTUAL_CLUSTER_ID}" \
  --name credit-card-fraud-detection \
  --execution-role-arn "${JOB_EXECUTION_ROLE_ARN}" \
  --release-label emr-6.9.0-latest \
  --job-driver "{\"sparkSubmitJobDriver\": {\"entryPoint\": \"s3://${S3_BUCKET}/credit_card_analysis.py\", \"sparkSubmitParameters\": \"--conf spark.executor.instances=2 --conf spark.driver.memory=4G --conf spark.driver.cores=2 --conf spark.kubernetes.tolerations=${TOLERATIONS} --conf spark.kubernetes.driver.podTemplateFile=s3://${S3_BUCKET}/templates/driver-template.yaml --conf spark.kubernetes.executor.podTemplateFile=s3://${S3_BUCKET}/templates/executor-template.yaml --conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.shuffleTracking.enabled=true --conf spark.hadoop.fs.s3.customAWSCredentialsProvider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain\"}}" \
  --configuration-overrides "{\"applicationConfiguration\": [{\"classification\": \"spark-defaults\", \"properties\": {\"spark.kubernetes.container.image\": \"${SPARK_IMAGE_URI}\", \"spark.hadoop.fs.s3a.impl\": \"org.apache.hadoop.fs.s3a.S3AFileSystem\", \"spark.hadoop.fs.s3a.aws.credentials.provider\": \"com.amazonaws.auth.DefaultAWSCredentialsProviderChain\"}}, {\"classification\": \"spark-env\", \"properties\": {}, \"configurations\": [{\"classification\":\"export\", \"properties\": {\"RDS_PASSWORD\": \"${RDS_PASSWORD}\",\"RDS_HOST\": \"${RDS_HOST}\"}}]}], \"monitoringConfiguration\": {\"cloudWatchMonitoringConfiguration\": {\"logGroupName\": \"/aws/emr-containers/ws-virtual-cluster\", \"logStreamNamePrefix\": \"credit-card-fraud\"}, \"s3MonitoringConfiguration\": {\"logUri\": \"s3://${S3_BUCKET}/logs/\"}}}" \
  --region "${AWS_DEFAULT_REGION}"
EOF
Shell
복사
# Make the generated script executable, then run it. These are two separate
# commands; on a single line the script path was just a second chmod operand
# and the script was never executed.
chmod +x submit-spark-job.sh
./submit-spark-job.sh
Shell
복사