#include "Parameters.h"
#include <algorithm>
#include <intrin.h>
#include <future>
#include <array>

namespace Solution
{
	// Partial result produced by SolveSub for one contiguous range of the input;
	// Solve combines several of these, in range order, into the final answer.
	struct SubSolution
	{
		// Maximum value reported for the range (0 when SolveSub exits early).
		unsigned short max_value;
		// Minimum value reported for the range (0 when SolveSub exits early).
		unsigned short min_value;
		// Best difference found inside the range; SolveSub stores -1 here when
		// its best difference is zero, and 0xFFFF on its early-exit path.
		int max_diff;
	};

	// Folds the scalar range [first,last) into the running minimum, the best
	// "later minus earlier" difference and the running maximum.
	// The maximum is updated for every element (not only for elements above the
	// running minimum) so that it is the true maximum of everything seen.
	static void FoldScalar( const unsigned short* first,const unsigned short* last,
		unsigned short& min_so_far,unsigned short& max_diff,unsigned short& max_so_far )
	{
		for( ; first != last; ++first )
		{
			const unsigned short value = *first;
			if( value > min_so_far )
			{
				max_diff = std::max( max_diff,static_cast<unsigned short>(value - min_so_far) );
			}
			max_so_far = std::max( max_so_far,value );
			min_so_far = std::min( min_so_far,value );
		}
	}

	// Scans `length` 16-bit unsigned values starting at element `iStart` of
	// `input` and returns the range's minimum, maximum and the largest
	// difference value[j] - value[i] with j > i (-1 when no positive difference
	// exists).
	//
	// The data is read through input.data() reinterpreted as unsigned 16-bit
	// lanes, eight per 128-bit load (assumes Input stores contiguous 16-bit
	// values -- confirm against Parameters.h). Whole groups of eight go through
	// the SSE path; the remaining length % 8 values are folded in scalar code.
	//
	// Uses _mm_shuffle_epi8 (SSSE3) and _mm_minpos_epu16 / _mm_testz_si128 /
	// _mm_min_epu16 (SSE4.1).
	//
	// Early exit: once the best difference reaches 0xFFFF (the largest possible
	// for 16-bit values) the function returns { 0,0,0xFFFF } immediately; the
	// min/max fields are placeholders in that case.
	SubSolution SolveSub( const Input& input,size_t iStart,size_t length )
	{
		constexpr size_t lanes = 8u;

		// Adding 0x8000 maps unsigned order onto signed order, which is what
		// _mm_cmplt_epi16 compares.
		const __m128i bias = _mm_set1_epi16( static_cast<short>(0x8000u) );
		const __m128i ones = _mm_set1_epi16( static_cast<short>(0xFFFFu) );
		const __m128i max_shorts_biased = _mm_set1_epi16( 0x7FFF );
		// Byte pattern 00,01,00,01,... : copies the low 16-bit lane to all lanes.
		const __m128i broadcast_low_short = _mm_set1_epi16( 0x0100 );

		// State is kept in vector registers. Maxima are stored complemented
		// (0xFFFF - x) so that the unsigned-min instructions can track them.
		// Only lane 0 of the two "comp" registers is meaningful.
		__m128i min_so_far_biased = max_shorts_biased;	// minimum = 0xFFFF
		__m128i max_diff_so_far_comp = ones;			// best difference = 0
		__m128i max_so_far_comp = ones;					// maximum = 0

		// Low 16-bit lane of a register as a non-negative int.
		const auto low16 = []( const __m128i& v )
		{
			return _mm_cvtsi128_si32( v ) & 0xFFFF;
		};

		const auto* vec = reinterpret_cast<const __m128i*>(input.data() + iStart);
		const auto* const vecEnd = vec + length / lanes;
		for( ; vec < vecEnd; ++vec )
		{
			if( low16( max_diff_so_far_comp ) == 0 )
			{
				// Best difference is already 0xFFFF; nothing can beat it.
				return { 0,0,0xFFFF };
			}

			const __m128i chunk = _mm_loadu_si128( vec );
			const __m128i chunk_biased = _mm_add_epi16( chunk,bias );
			const __m128i below_min = _mm_cmplt_epi16( chunk_biased,min_so_far_biased );
			if( _mm_testz_si128( below_min,ones ) )
			{
				// No lane undercuts the running minimum, so the best sale in
				// this chunk is simply (chunk maximum - running minimum).
				const __m128i chunk_comp = _mm_sub_epi16( ones,chunk );
				// minpos of the complements leaves (0xFFFF - chunk max) in lane 0.
				const __m128i max_chunk_comp = _mm_minpos_epu16( chunk_comp );
				max_so_far_comp = _mm_min_epu16( max_chunk_comp,max_so_far_comp );
				// bias - (min + bias) == -min (mod 2^16), hence
				// (0xFFFF - max) - (-min) == 0xFFFF - (max - min).
				const __m128i neg_min = _mm_sub_epi16( bias,min_so_far_biased );
				const __m128i diff_comp = _mm_sub_epi16( max_chunk_comp,neg_min );
				max_diff_so_far_comp = _mm_min_epu16( max_diff_so_far_comp,diff_comp );
			}
			else
			{
				// The chunk contains a new minimum: element order matters, so
				// unpack the state, walk the eight values, and repack.
				auto min_so_far = static_cast<unsigned short>(low16( min_so_far_biased ) + 0x8000);
				auto max_diff = static_cast<unsigned short>(0xFFFF - low16( max_diff_so_far_comp ));
				auto max_so_far = static_cast<unsigned short>(0xFFFF - low16( max_so_far_comp ));

				const auto* scalars = reinterpret_cast<const unsigned short*>(vec);
				FoldScalar( scalars,scalars + lanes,min_so_far,max_diff,max_so_far );

				min_so_far_biased = _mm_shuffle_epi8(
					_mm_cvtsi32_si128( static_cast<int>(min_so_far) - 0x8000 ),
					broadcast_low_short );
				max_diff_so_far_comp = _mm_cvtsi32_si128( 0xFFFF - static_cast<int>(max_diff) );
				max_so_far_comp = _mm_cvtsi32_si128( 0xFFFF - static_cast<int>(max_so_far) );
			}
		}

		auto min_so_far = static_cast<unsigned short>(low16( min_so_far_biased ) + 0x8000);
		auto max_diff = static_cast<unsigned short>(0xFFFF - low16( max_diff_so_far_comp ));
		auto max_so_far = static_cast<unsigned short>(0xFFFF - low16( max_so_far_comp ));

		// Values that do not fill a whole 128-bit chunk.
		const auto* tail = reinterpret_cast<const unsigned short*>(vecEnd);
		FoldScalar( tail,tail + length % lanes,min_so_far,max_diff,max_so_far );

		SubSolution ss;
		ss.max_diff = max_diff == 0 ? -1 : max_diff;
		ss.min_value = min_so_far;
		ss.max_value = max_so_far;
		return ss;
	}

	// Splits `units` work items across the groups in [begin,end), writing each
	// group's share to the corresponding element.
	//
	// Work is handed out in indivisible chunks of 8 units: units / 8 whole
	// chunks are distributed as evenly as possible, with the first
	// (chunks % groups) groups receiving one extra chunk. The units % 8
	// leftover units are not assigned to any group.
	//
	// An empty range is a no-op (previously it divided by zero).
	template<typename I>
	void AllocateUnits( I begin,I end,size_t units )
	{
		constexpr size_t chunk_size = 8u;
		if( begin == end )
		{
			return;
		}

		const size_t nGroups = static_cast<size_t>(end - begin);
		const size_t nChunks = units / chunk_size;
		const size_t chunksPerGroup = nChunks / nGroups;
		const size_t groupsWithExtra = nChunks % nGroups;
		for( size_t g = 0; begin != end; ++begin,++g )
		{
			const size_t chunks = chunksPerGroup + (g < groupsWithExtra ? 1u : 0u);
			*begin = chunks * chunk_size;
		}
	}

	// Returns the largest difference value[j] - value[i] with j > i over the
	// whole input, or -1 when no positive difference exists.
	//
	// Inputs shorter than 200000 values are handled by a single SolveSub call;
	// larger inputs are split into four ranges (whole 8-value chunks, see
	// AllocateUnits) that are solved concurrently with std::async and then
	// merged in order. The input.size() % 8 values that do not fill a whole
	// chunk are folded in afterwards with scalar code, so no value is skipped.
	Output Solve( const Input& input )
	{
		constexpr size_t chunk_size = 8u;
		constexpr size_t parallel_threshold = 200000u;
		constexpr size_t nThreads = 4;

		const size_t total = input.size();
		// Number of leading values that form whole chunks; SolveSub is only
		// ever asked to process these.
		const size_t chunked = total - total % chunk_size;

		int min = 0;
		int max_diff = 0;
		if( total < parallel_threshold )
		{
			const SubSolution whole = SolveSub( input,0,chunked );
			min = whole.min_value;
			max_diff = whole.max_diff;
		}
		else
		{
			std::array<size_t,nThreads> workloads;
			AllocateUnits( workloads.begin(),workloads.end(),total );

			std::array<std::future<SubSolution>,nThreads> futures;
			size_t offset = 0;
			for( size_t i = 0; i < nThreads; ++i )
			{
				futures[i] = std::async( std::launch::async,SolveSub,std::cref( input ),offset,workloads[i] );
				offset += workloads[i];
			}

			std::array<SubSolution,nThreads> parts;
			for( size_t i = 0; i < nThreads; ++i )
			{
				parts[i] = futures[i].get();
			}

			// Merge left to right: the best pair either lies inside one range
			// or buys at the minimum of the earlier ranges and sells at the
			// maximum of a later one.
			min = parts[0].min_value;
			max_diff = parts[0].max_diff;
			for( size_t i = 1; i < nThreads; ++i )
			{
				const int range_max = parts[i].max_value;
				if( min < range_max )
				{
					max_diff = std::max( max_diff,range_max - min );
				}
				max_diff = std::max( max_diff,parts[i].max_diff );
				min = std::min( min,static_cast<int>(parts[i].min_value) );
			}
		}

		// Trailing values that do not fill a whole chunk. They are read the
		// same way SolveSub reads the data: as unsigned 16-bit values.
		const auto* values = reinterpret_cast<const unsigned short*>(input.data());
		for( size_t i = chunked; i < total; ++i )
		{
			const int value = values[i];
			if( value > min )
			{
				max_diff = std::max( max_diff,value - min );
			}
			min = std::min( min,value );
		}
		return max_diff;
	}
}