Redshift setup not working


#1

I am trying to load my S3 logs into Redshift, but I am getting an error.

Error

Loading Snowplow events and shredded types into sp (Redshift cluster)…
Unexpected error: Cannot find atomic-events directory in shredded/good
uri:classloader:/storage-loader/lib/snowplow-storage-loader/redshift_loader.rb:74:in `load_events_and_shredded_types'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/method_reference.rb:43:in `send_to'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/call_with.rb:76:in `call_with'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/method_handler.rb:138:in `block in redefine_method'
uri:classloader:/storage-loader/bin/snowplow-storage-loader:54:in `block in (root)'
uri:classloader:/storage-loader/bin/snowplow-storage-loader:51:in `<main>'
org/jruby/RubyKernel.java:973:in `load'
uri:classloader:/META-INF/main.rb:1:in `<main>'
org/jruby/RubyKernel.java:955:in `require'
uri:classloader:/META-INF/main.rb:1:in `(root)'
uri:classloader:/META-INF/jruby.home/lib/ruby/stdlib/rubygems/core_ext/kernel_require.rb:1:in `<main>'

my setup: Scala Stream Collector > Stream Enrich > Kinesis > S3 > storage loader > redshift

enricher.conf

enrich {
source = "kinesis"
sink = "kinesis"
aws {
access-key: "key"
secret-key: "key"
}

kafka {
brokers: "{{enrichKafkaBrokers}}"
}

streams {
in: {
raw: "good"
maxRecords: 10000
buffer: {
byte-limit: 4500000
record-limit: 500 # Not supported by Kafka; will be ignored
time-limit: 60000
}
}

out: {
enriched: "enriched"
bad: "bad"

backoffPolicy: {
minBackoff: 3000
maxBackoff: 600000
}
}

app-name: "enricher-app"
initial-position = "TRIM_HORIZON"
region: "us-west-2"
}

monitoring {
snowplow {
collector-uri: "xx.xx.xx.xx"
collector-port: 80
app-id: "collector-monitor"
method: "GET"
}
}
}

snowplow-kinesis-s3-0.4.0 config:

sink {
aws {
access-key: "key"
secret-key: "key"
}
kinesis {
in {
stream-name: "good"
initial-position: "TRIM_HORIZON"
max-records: "10000"
}
out {
stream-name: "bad"
}
region: "us-west-2"
app-name: "s3-sink-app"
}
s3 {
region: "us-west-2"
endpoint: "http://s3-us-west-2.s3.amazonaws.com"
bucket: "bucket-name/logs"
max-timeout: "300000"
format: "lzo"
}
buffer {
byte-limit: 4500000
record-limit: 500 # Not supported by Kafka; will be ignored
time-limit: 60000
}
logging {
level: "error"
}
}

Storage Loader config:

aws:

# Credentials can be hardcoded or set in environment variables

access_key_id: XXXXXXXXXXXXX
secret_access_key: XXXXXXXXXXXXX

s3:
region: us-west-2
buckets:
assets: s3://snowplow-hosted-assets # DO NOT CHANGE unless you are hosting the jarfiles etc yourself in your own bucket
jsonpath_assets: # If you have defined your own JSON Schemas, add the s3:// path to your own JSON Path files in your own bucket here
log: s3://s3-bucket-log/slog/

  raw:
    in:
      - s3://s3-bucket-log/logs # e.g. s3://my-in-bucket
    processing: s3://s3-bucket-log/raw/processing
    archive: s3://s3-bucket-log/archive/raw # e.g. s3://my-archive-bucket/raw
  enriched:
    good: s3://s3-bucket-log/enrich/good       # e.g. s3://my-out-bucket/enriched/good
    bad: s3://s3-bucket-log/enrich/bad # e.g. s3://my-out-bucket/enriched/bad
    errors: s3://s3-bucket-log/enrich/errors # Leave blank unless :continue_on_unexpected_error: set to true below
    archive: s3://s3-bucket-log/enriched # Where to archive enriched events to, e.g. s3://my-archive-bucket/enriched
  shredded:
    good: s3://s3-bucket-log/shredded/good # e.g. s3://my-out-bucket/shredded/good
    bad: s3://s3-bucket-log/shredded/bad # e.g. s3://my-out-bucket/shredded/bad
    errors: s3://s3-bucket-log/shredded/errors # Leave blank unless :continue_on_unexpected_error: set to true below
    archive: s3://s3-bucket-log/archive/shredded # Where to archive shredded events to, e.g. s3://my-archive-bucket/shredded

emr:
ami_version: 4.5.0
region: us-west-2 # Always set this
jobflow_role: EMR_EC2_DefaultRole # Created using $ aws emr create-default-roles
service_role: EMR_DefaultRole # Created using $ aws emr create-default-roles
placement: # Set this if not running in VPC. Leave blank otherwise
ec2_subnet_id: XXXXXXXXXXX # Set this if running in VPC. Leave blank otherwise
ec2_key_name: XXXXXXXXX
bootstrap: [] # Set this to specify custom boostrap actions. Leave empty otherwise
software:
hbase: # Optional. To launch on cluster, provide version, "0.92.0", keep quotes. Leave empty otherwise.
lingual: # Optional. To launch on cluster, provide version, "1.1", keep quotes. Leave empty otherwise.
# Adjust your Hadoop cluster below
jobflow:
master_instance_type: m1.medium
core_instance_count: 2
core_instance_type: m1.medium
task_instance_count: 0 # Increase to use spot instances
task_instance_type: m1.medium
task_instance_bid: 0.015 # In USD. Adjust bid, or leave blank for non-spot-priced (i.e. on-demand) task instances
bootstrap_failure_tries: 3 # Number of times to attempt the job in the event of bootstrap failures
additional_info: # Optional JSON string for selecting additional features
collectors:
format: thrift # For example: 'clj-tomcat' for the Clojure Collector, 'thrift' for Thrift records, 'tsv/com.amazon.aws.cloudfront/wd_access_log' for Cloudfront access logs or 'ndjson/urbanairship.connect/v1' for UrbanAirship Connect events
enrich:
job_name: snowplow-enrich # Give your job a name
versions:
hadoop_enrich: 1.8.0 # Version of the Hadoop Enrichment process
hadoop_shred: 0.9.0 # Version of the Hadoop Shredding process
hadoop_elasticsearch: 0.1.0 # Version of the Hadoop to Elasticsearch copying process
continue_on_unexpected_error: false # Set to 'true' (and set :out_errors: above) if you don't want any exceptions thrown from ETL
output_compression: NONE # Compression only supported with Redshift, set to NONE if you have Postgres targets. Allowed formats: NONE, GZIP
storage:
download:
folder: # Postgres-only config option. Where to store the downloaded files. Leave blank for Redshift
targets:
- name: "XXXX"
type: redshift
host: xxxxxxxxxxxxx.redshift.amazonaws.com # The endpoint as shown in the Redshift console
database: XXXXXXXXX
port: 5439
ssl_mode: disable
table: atomic.events
username: XXXXXXXX
password: XXXXXXXXX
maxerror: 10 # Stop loading on first error, or increase to permit more load errors
comprows: 200000 # Default for a 1 XL node cluster. Not used unless --include compupdate specified
monitoring:
logging:
level: DEBUG


#2

Maybe the path for shredded atomic events is wrong?

Unexpected error: Cannot find atomic-events directory in shredded/good

#3

Have a look at this thread here.

The data that ends up in Redshift is the result of the EMR process rather than a product of the data that’s been enriched by stream enrich. The EMR enrichment process includes shredding while the stream enrich does not at this point in time.


#4

@mike @ecoron thanks for the info. I am now trying to run EmrEtlRunner.
I think my config is correct,
but I am getting the following error:

    Value guarded in: Snowplow::EmrEtlRunner::Cli::load_config
    With Contract: Maybe, String => Hash
    At: uri:classloader:/emr-etl-runner/lib/snowplow-emr-etl-runner/cli.rb:134 ):
uri:classloader:/gems/contracts-0.11.0/lib/contracts.rb:45:in `block in Contract'
uri:classloader:/gems/contracts-0.11.0/lib/contracts.rb:154:in `failure_callback'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/call_with.rb:80:in `call_with'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/method_handler.rb:138:in `block in redefine_method'
uri:classloader:/emr-etl-runner/lib/snowplow-emr-etl-runner/cli.rb:106:in `process_options'
uri:classloader:/emr-etl-runner/lib/snowplow-emr-etl-runner/cli.rb:92:in `get_args_config_enrichments_resolver'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/method_reference.rb:43:in `send_to'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/call_with.rb:76:in `call_with'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/method_handler.rb:138:in `block in redefine_method'
uri:classloader:/emr-etl-runner/bin/snowplow-emr-etl-runner:37:in `<main>'
org/jruby/RubyKernel.java:973:in `load'
uri:classloader:/META-INF/main.rb:1:in `<main>'
org/jruby/RubyKernel.java:955:in `require'
uri:classloader:/META-INF/main.rb:1:in `(root)'
uri:classloader:/META-INF/jruby.home/lib/ruby/stdlib/rubygems/core_ext/kernel_require.rb:1:in `<main>'

#5

@mike is correct - at this point in time, to load Redshift you need to use a Lambda architecture with Snowplow:

http://discourse.snowplowanalytics.com/t/how-to-setup-a-lambda-architecture-for-snowplow/249


#6

@alex thanks for the info. I am now trying to run EmrEtlRunner.
I think my config is correct,
but I am getting the following error:

Value guarded in: Snowplow::EmrEtlRunner::Cli::load_config
With Contract: Maybe, String => Hash
At: uri:classloader:/emr-etl-runner/lib/snowplow-emr-etl-runner/cli.rb:134 ):

uri:classloader:/gems/contracts-0.11.0/lib/contracts.rb:45:in `block in Contract'
uri:classloader:/gems/contracts-0.11.0/lib/contracts.rb:154:in `failure_callback'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/call_with.rb:80:in `call_with'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/method_handler.rb:138:in `block in redefine_method'
uri:classloader:/emr-etl-runner/lib/snowplow-emr-etl-runner/cli.rb:106:in `process_options'
uri:classloader:/emr-etl-runner/lib/snowplow-emr-etl-runner/cli.rb:92:in `get_args_config_enrichments_resolver'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/method_reference.rb:43:in `send_to'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/call_with.rb:76:in `call_with'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/method_handler.rb:138:in `block in redefine_method'
uri:classloader:/emr-etl-runner/bin/snowplow-emr-etl-runner:37:in `<main>'
org/jruby/RubyKernel.java:973:in `load'
uri:classloader:/META-INF/main.rb:1:in `<main>'
org/jruby/RubyKernel.java:955:in `require'
uri:classloader:/META-INF/main.rb:1:in `(root)'
uri:classloader:/META-INF/jruby.home/lib/ruby/stdlib/rubygems/core_ext/kernel_require.rb:1:in `<main>'


#7

@alex
my configuration is exactly like “Lambda architecture for Snowplow”.

Collector > Kinesis (RAW Stream) > S3 > EmrEtlRunner > Redshift
                                 > Stream Enrich > Elasticsearch

I have tried lots of different config.yml files, but I am still getting the following error:

Value guarded in: Snowplow::EmrEtlRunner::Cli::load_config
With Contract: Maybe, String => Hash
At: uri:classloader:/emr-etl-runner/lib/snowplow-emr-etl-runner/cli.rb:134 ):

uri:classloader:/gems/contracts-0.11.0/lib/contracts.rb:45:in `block in Contract'
uri:classloader:/gems/contracts-0.11.0/lib/contracts.rb:154:in `failure_callback'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/call_with.rb:80:in `call_with'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/method_handler.rb:138:in `block in redefine_method'
uri:classloader:/emr-etl-runner/lib/snowplow-emr-etl-runner/cli.rb:106:in `process_options'
uri:classloader:/emr-etl-runner/lib/snowplow-emr-etl-runner/cli.rb:92:in `get_args_config_enrichments_resolver'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/method_reference.rb:43:in `send_to'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/call_with.rb:76:in `call_with'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/method_handler.rb:138:in `block in redefine_method'
uri:classloader:/emr-etl-runner/bin/snowplow-emr-etl-runner:37:in `<main>'
org/jruby/RubyKernel.java:973:in `load'
uri:classloader:/META-INF/main.rb:1:in `<main>'
org/jruby/RubyKernel.java:955:in `require'
uri:classloader:/META-INF/main.rb:1:in `(root)'
uri:classloader:/META-INF/jruby.home/lib/ruby/stdlib/rubygems/core_ext/kernel_require.rb:1:in `<main>'


#8

Hi @geetanshjindal, it’s worth posting your config.yml file for EmrEtlRunner here as it looks like a value in that configuration isn’t correct.


#9

@mike I am using the same config for EmrEtlRunner and StorageLoader.

Storage Loader config:

aws:

access_key_id: XXXXXXXXXXXXX
secret_access_key: XXXXXXXXXXXXX

s3:
region: us-west-2
buckets:
assets: s3://snowplow-hosted-assets # DO NOT CHANGE unless you are hosting the jarfiles etc yourself in your own bucket
jsonpath_assets: # If you have defined your own JSON Schemas, add the s3:// path to your own JSON Path files in your own bucket here
log: s3://s3-bucket-log/slog/

  raw:
    in:
      - s3://s3-bucket-log/logs # e.g. s3://my-in-bucket
    processing: s3://s3-bucket-log/raw/processing
    archive: s3://s3-bucket-log/archive/raw # e.g. s3://my-archive-bucket/raw
  enriched:
    good: s3://s3-bucket-log/enrich/good       # e.g. s3://my-out-bucket/enriched/good
    bad: s3://s3-bucket-log/enrich/bad # e.g. s3://my-out-bucket/enriched/bad
    errors: s3://s3-bucket-log/enrich/errors # Leave blank unless :continue_on_unexpected_error: set to true below
    archive: s3://s3-bucket-log/enriched # Where to archive enriched events to, e.g. s3://my-archive-bucket/enriched
  shredded:
    good: s3://s3-bucket-log/shredded/good # e.g. s3://my-out-bucket/shredded/good
    bad: s3://s3-bucket-log/shredded/bad # e.g. s3://my-out-bucket/shredded/bad
    errors: s3://s3-bucket-log/shredded/errors # Leave blank unless :continue_on_unexpected_error: set to true below
    archive: s3://s3-bucket-log/archive/shredded # Where to archive shredded events to, e.g. s3://my-archive-bucket/shredded
emr:
ami_version: 4.5.0
region: us-west-2 # Always set this
jobflow_role: EMR_EC2_DefaultRole # Created using $ aws emr create-default-roles
service_role: EMR_DefaultRole # Created using $ aws emr create-default-roles
placement: # Set this if not running in VPC. Leave blank otherwise
ec2_subnet_id: XXXXXXXXXXX # Set this if running in VPC. Leave blank otherwise
ec2_key_name: XXXXXXXXX
bootstrap: [] # Set this to specify custom boostrap actions. Leave empty otherwise
software:
hbase: # Optional. To launch on cluster, provide version, "0.92.0", keep quotes. Leave empty otherwise.
lingual: # Optional. To launch on cluster, provide version, "1.1", keep quotes. Leave empty otherwise.
jobflow:
master_instance_type: m1.medium
core_instance_count: 2
core_instance_type: m1.medium
task_instance_count: 0 # Increase to use spot instances
task_instance_type: m1.medium
task_instance_bid: 0.015 # In USD. Adjust bid, or leave blank for non-spot-priced (i.e. on-demand) task instances
bootstrap_failure_tries: 3 # Number of times to attempt the job in the event of bootstrap failures
additional_info: # Optional JSON string for selecting additional features
collectors:
format: thrift # For example: 'clj-tomcat' for the Clojure Collector, 'thrift' for Thrift records, 'tsv/com.amazon.aws.cloudfront/wd_access_log' for Cloudfront access logs or 'ndjson/urbanairship.connect/v1' for UrbanAirship Connect events
enrich:
job_name: snowplow-enrich # Give your job a name
versions:
hadoop_enrich: 1.8.0 # Version of the Hadoop Enrichment process
hadoop_shred: 0.9.0 # Version of the Hadoop Shredding process
hadoop_elasticsearch: 0.1.0 # Version of the Hadoop to Elasticsearch copying process
continue_on_unexpected_error: false # Set to 'true' (and set :out_errors: above) if you don't want any exceptions thrown from ETL
output_compression: NONE # Compression only supported with Redshift, set to NONE if you have Postgres targets. Allowed formats: NONE, GZIP
storage:
download:
folder: # Postgres-only config option. Where to store the downloaded files. Leave blank for Redshift
targets:
- name: "XXXX"
type: redshift
host: xxxxxxxxxxxxx.redshift.amazonaws.com # The endpoint as shown in the Redshift console
database: XXXXXXXXX
port: 5439 
ssl_mode: disable 
table: atomic.events
username: XXXXXXXX
password: XXXXXXXXX
maxerror: 10 # Stop loading on first error, or increase to permit more load errors
comprows: 200000 # Default for a 1 XL node cluster. Not used unless --include compupdate specified
monitoring:
logging:
level: DEBUG

#10

@mike any update ?


#11

Can you repost or edit the existing config with the indentation of the .yml file corrected?


#12

@mike

aws:
access_key_id: KEY
emr:
additional_info: ~
ami_version: "4.5.0"
bootstrap: []
bootstrap_failure_tries: 3
ec2_key_name: snowplow
ec2_subnet_id: subnet-0c120000
jobflow:
core_instance_count: 2
core_instance_ebs:
ebs_optimized: false
volume_iops: 400
volume_size: 100
volume_type: gp2
core_instance_type: m1.medium
master_instance_type: m1.medium
task_instance_bid: 0.015
task_instance_count: 0
task_instance_type: m1.medium
jobflow_role: EMR_EC2_DefaultRole
placement: ~
region: us-west-2
service_role: EMR_DefaultRole
software:
hbase: ~
lingual: ~
s3:
buckets:
assets: "s3://snowplow-hosted-assets"
enriched:
archive: "s3://snowplow-logs-xxx/enriched"
bad: "s3://snowplow-logs-xxx/enrich/bad"
errors: "s3://snowplow-logs-xxx/enrich/errors"
good: "s3://snowplow-logs-xxx/enrich/good"
jsonpath_assets: ~
log: "s3://snowplow-logs-xxx/slog/"
raw:
archive: "s3://snowplow-logs-xxx/archive/raw"
in:
- "s3://snowplow-logs-xxx/logs"
processing: "s3://snowplow-logs-xxx/raw/pro"
shredded:
archive: "s3://snowplow-logs-xxx/archive/shredded"
bad: "s3://snowplow-logs-xxx/shredded/bad"
errors: "s3://snowplow-logs-xxx/shredded/errors"
good: "s3://snowplow-logs-xxx/shredded/good"
region: us-west-2
secret_access_key: KEY
collectors:
format: thrift
enrich:
continue_on_unexpected_error: false
job_name: "Snowplow ETL"
output_compression: NONE
versions:
hadoop_elasticsearch: "0.1.0"
hadoop_enrich: "1.8.0"
hadoop_shred: "0.10.0"
monitoring:
logging:
level: DEBUG
snowplow:
app_id: emrapp
collector: sp.domain.com
method: get
tags: {}
storage:
download:
folder: ~
targets:
-
comprows: 200000
database: snowplow
host: sp.xxxxxxxxxxxxxxx.us-west-2.redshift.amazonaws.com
maxerror: 1
name: sp
password: xxxxxx
port: 5439
ssl_mode: disable
table: atomic.events
type: redshift
username: snowuser

And this is the issue I am facing:

Loading Snowplow events and shredded types into sp (Redshift cluster)…
Unexpected error: Cannot find atomic-events directory in shredded/good
uri:classloader:/storage-loader/lib/snowplow-storage-loader/redshift_loader.rb:74:in `load_events_and_shredded_types'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/method_reference.rb:43:in `send_to'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/call_with.rb:76:in `call_with'
uri:classloader:/gems/contracts-0.11.0/lib/contracts/method_handler.rb:138:in `block in redefine_method'
uri:classloader:/storage-loader/bin/snowplow-storage-loader:54:in `block in (root)'
uri:classloader:/storage-loader/bin/snowplow-storage-loader:51:in `<main>'
org/jruby/RubyKernel.java:973:in `load'
uri:classloader:/META-INF/main.rb:1:in `<main>'
org/jruby/RubyKernel.java:955:in `require'
uri:classloader:/META-INF/main.rb:1:in `(root)'
uri:classloader:/META-INF/jruby.home/lib/ruby/stdlib/rubygems/core_ext/kernel_require.rb:1:in `<main>'


#13

Are you attempting to load data that’s been output from EMR to Redshift? This error comes up often if you are instead trying to load data that’s been generated from the Kinesis sink directly.


#14

@mike
I am running the following command:

$ ./snowplow-emr-etl-runner --config valid.conf --resolver resolver.json 

and the output I am getting is:

>       x snowplow-logs-xxx/logs/2017-03-09-49570785775441028786451005644987930971409103180051513346-49570785775441028786451005654573503795133617478186303490.lzo
> (t1)    MOVE snowplow-logs-xxx/logs/2017-03-09-49570785775441028786451005747183266206713542972977184770-49570785775441028786451005757034802710753288059053146114.lzo.index -> snowplow-logs-xxx/raw/pro/2017-03-09-49570785775441028786451005747183266206713542972977184770-49570785775441028786451005757034802710753288059053146114.lzo.us-west-2.logs.index
>       +-> snowplow-logs-xxx/raw/pro/2017-03-09-49570785775441028786451005717594806771645028003991257090-49570785775441028786451005727371389874868694874696187906.lzo.us-west-2.logs.index
>       +-> snowplow-logs-xxx/raw/pro/2017-03-09-49570785775441028786451005707650182979494894623933857794-49570785775441028786451005717593597845825413374816550914.lzo.us-west-2.logs.index
>       +-> snowplow-logs-xxx/raw/pro/2017-03-09-49570785775441028786451005717594806771645028003991257090-49570785775441028786451005727371389874868694874696187906.us-west-2.logs.lzo      +-> snowplow-logs-xxx/raw/pro/2017-03-09-49570785775441028786451005707650182979494894623933857794-49570785775441028786451005717593597845825413374816550914.us-west-2.logs.lzo

      x snowplow-logs-xxx/logs/2017-03-09-49570785775441028786451006382586214515429979268875026434-49570785775441028786451006382747001649438724949110947842.lzo.index
D, [2017-03-09T08:05:37.456000 #14981] DEBUG -- : Waiting a minute to allow S3 to settle (eventual consistency)
D, [2017-03-09T08:06:37.466000 #14981] DEBUG -- : Initializing EMR jobflow
D, [2017-03-09T08:06:40.935000 #14981] DEBUG -- : EMR jobflow j-11OBFI31X33VQ started, waiting for jobflow to complete...
I, [2017-03-09T08:06:40.947000 #14981]  INFO -- : SnowplowTracker::Emitter initialized with endpoint http://sp.domain.com:80/i
I, [2017-03-09T08:06:41.513000 #14981]  INFO -- : Attempting to send 1 request
I, [2017-03-09T08:06:41.520000 #14981]  INFO -- : Sending GET request to http://sp.domain.com:80/i...
I, [2017-03-09T08:06:41.612000 #14981]  INFO -- : GET request to http://sp.domain.com:80/i finished with status code 200
I, [2017-03-09T08:10:43.826000 #14981]  INFO -- : Attempting to send 1 request
I, [2017-03-09T08:10:43.830000 #14981]  INFO -- : Sending GET request to http://sp.domain.com:80/i...
I, [2017-03-09T08:10:43.952000 #14981]  INFO -- : GET request to http://sp.domain.com:80/i finished with status code 200
F, [2017-03-09T08:10:44.537000 #14981] FATAL -- :

Snowplow::EmrEtlRunner::EmrExecutionError (EMR jobflow j-11OBFI31X33VQ failed, check Amazon EMR console and Hadoop logs for details (help: https://github.com/snowplow/snowplow/wiki/Troubleshooting-jobs-on-Elastic-MapReduce). Data files not archived.
Snowplow ETL: TERMINATED_WITH_ERRORS [VALIDATION_ERROR] ~ elapsed time n/a [ - 2017-03-09 08:09:13 UTC]
 - 1. Elasticity S3DistCp Step: Raw S3 Staging -> S3 Archive: CANCELLED ~ elapsed time n/a [ - ]
 - 2. Elasticity S3DistCp Step: Shredded HDFS -> S3: CANCELLED ~ elapsed time n/a [ - ]
 - 3. Elasticity Scalding Step: Shred Enriched Events: CANCELLED ~ elapsed time n/a [ - ]
 - 4. Elasticity S3DistCp Step: Enriched HDFS _SUCCESS -> S3: CANCELLED ~ elapsed time n/a [ - ]
 - 5. Elasticity S3DistCp Step: Enriched HDFS -> S3: CANCELLED ~ elapsed time n/a [ - ]
 - 6. Elasticity Scalding Step: Enrich Raw Events: CANCELLED ~ elapsed time n/a [ - ]
 - 7. Elasticity S3DistCp Step: Raw S3 -> HDFS: CANCELLED ~ elapsed time n/a [ - ]):
    uri:classloader:/emr-etl-runner/lib/snowplow-emr-etl-runner/emr_job.rb:500:in `run'
    uri:classloader:/gems/contracts-0.11.0/lib/contracts/method_reference.rb:43:in `send_to'
    uri:classloader:/gems/contracts-0.11.0/lib/contracts/call_with.rb:76:in `call_with'
    uri:classloader:/gems/contracts-0.11.0/lib/contracts/method_handler.rb:138:in `block in redefine_method'
    uri:classloader:/emr-etl-runner/lib/snowplow-emr-etl-runner/runner.rb:69:in `run'
    uri:classloader:/gems/contracts-0.11.0/lib/contracts/method_reference.rb:43:in `send_to'
    uri:classloader:/gems/contracts-0.11.0/lib/contracts/call_with.rb:76:in `call_with'
    uri:classloader:/gems/contracts-0.11.0/lib/contracts/method_handler.rb:138:in `block in redefine_method'
    uri:classloader:/emr-etl-runner/bin/snowplow-emr-etl-runner:39:in `<main>'
    org/jruby/RubyKernel.java:973:in `load'
    uri:classloader:/META-INF/main.rb:1:in `<main>'
    org/jruby/RubyKernel.java:955:in `require'
    uri:classloader:/META-INF/main.rb:1:in `(root)'
    uri:classloader:/META-INF/jruby.home/lib/ruby/stdlib/rubygems/core_ext/kernel_require.rb:1:in `<main>'

#15

@mike @alex is there anything you can suggest?
I have tried everything, but I am still facing the same issue.


#16

Hi @geetanshjindal - a VALIDATION_ERROR normally means that EMR couldn’t spin up a cluster using the specified EC2 instance types.

If you check in the EMR UI, you should see a line of text that details the specific validation error - can you share that error text with us?


#17

Hi Alex,

You were right. The error shown on the EMR page is “EMR_DefaultRole is invalid”.


#18

To resolve this have a look at the AWS knowledge base article here and then attempt to rerun the EMR job with the default roles.


#19

@mike @alex

Thanks for the suggestion.
I am getting a new error now. I am also trying to fix this on my own, but let me know if you have a fix for it.

Thanks

Excon::Error::Socket (Unsupported record version Unknown-0.0 (OpenSSL::SSL::SSLError)):
org/jruby/ext/openssl/SSLSocket.java:222:in `connect_nonblock'
uri:classloader:/gems/excon-0.52.0/lib/excon/ssl_socket.rb:121:in `initialize'
uri:classloader:/gems/excon-0.52.0/lib/excon/connection.rb:403:in `socket'
uri:classloader:/gems/excon-0.52.0/lib/excon/connection.rb:100:in `request_call'
uri:classloader:/gems/excon-0.52.0/lib/excon/middlewares/mock.rb:48:in `request_call'
uri:classloader:/gems/excon-0.52.0/lib/excon/middlewares/instrumentor.rb:26:in `request_call'
uri:classloader:/gems/excon-0.52.0/lib/excon/middlewares/base.rb:16:in `request_call'
uri:classloader:/gems/excon-0.52.0/lib/excon/middlewares/base.rb:16:in `request_call'
uri:classloader:/gems/excon-0.52.0/lib/excon/middlewares/base.rb:16:in `request_call'
uri:classloader:/gems/excon-0.52.0/lib/excon/connection.rb:249:in `request'
uri:classloader:/gems/fog-xml-0.1.2/lib/fog/xml/sax_parser_connection.rb:35:in `request'
uri:classloader:/gems/fog-xml-0.1.2/lib/fog/xml/connection.rb:7:in `request'
uri:classloader:/gems/fog-1.25.0/lib/fog/aws/storage.rb:521:in `_request'
uri:classloader:/gems/fog-1.25.0/lib/fog/aws/storage.rb:516:in `request'
uri:classloader:/gems/fog-1.25.0/lib/fog/aws/requests/storage/copy_object.rb:32:in `copy_object'
uri:classloader:/gems/fog-1.25.0/lib/fog/aws/models/storage/file.rb:92:in `copy'
uri:classloader:/gems/sluice-0.4.0/lib/sluice/storage/s3/s3.rb:622:in `block in retry_x'
org/jruby/ext/timeout/Timeout.java:117:in `timeout'
uri:classloader:/gems/sluice-0.4.0/lib/sluice/storage/s3/s3.rb:621:in `retry_x'
uri:classloader:/gems/sluice-0.4.0/lib/sluice/storage/s3/s3.rb:548:in `block in process_files'
org/jruby/RubyKernel.java:1295:in `loop'
uri:classloader:/gems/sluice-0.4.0/lib/sluice/storage/s3/s3.rb:412:in `block in process_files'


#20

I have not found anything yet; let me know if there is anything you can suggest.
@mike @alex