This is an Amazon interview question for senior software development engineer from careercup. Here is the original thread,
"
Given an array of 1 billion numbers with just a few 100 missing, find all the missing numbers. you have only enough memory to store only 10k elements
".2. Data Structure and Algorithm
3 approaches are proposed in my C++ post, Data Structure and Algorithm - Detect Missing Numbers in a Large Set. The C++ implementation is there too.
3. Python Implementation
An abstract base class called "NumberStream" is implemented to act as the 1 billion numbers. A derived class called "NumberGenerator" is implemented to return the numbers with range of 1 billion with missing numbers.
A loop and recursive solution are implemented for Approach 2. A loop implementation is implemented for Approach 3. The test is based on a smaller scale, with 1 million number with 100 missing and 10000/100000/1000000 memory available
# *******************************************************************************
# Implementation
# *******************************************************************************
from abc import ABCMeta, abstractmethod
class NumberStream:
"""description of class"""
__metadata__ = ABCMeta
@abstractmethod
def HasNext(self):
pass
@abstractmethod
def GetNext(self):
pass
@abstractmethod
def ResetToHead(self):
pass
@abstractmethod
def GetMissingNumbers(self):
pass
@abstractmethod
def GetUpperBound(self):
pass
@abstractmethod
def GetMissing(self):
pass
.
from NumberStream import NumberStream
import random
class NumberGenerator(NumberStream):
"""description of class"""
def __init__(self, upperBound, missing):
#super(self, upperBound, missing)
self.m_UpperBound = upperBound
self.m_Missing = missing
self.m_MissingNumbers = set()
self.m_CurVal = 0;
self.m_MissingDetected = 0;
random.seed()
count = 0
# generate random numbers as the missing numbers
while count < missing:
temp = random.randint(1, upperBound)
if temp not in self.m_MissingNumbers:
self.m_MissingNumbers.add(temp)
count = count + 1
def GetNext(self):
self.m_CurVal = self.m_CurVal + 1
while self.m_CurVal in self.m_MissingNumbers:
self.m_CurVal = self.m_CurVal + 1
self.m_MissingDetected = self.m_MissingDetected + 1
return self.m_CurVal
def HasNext(self):
return self.m_CurVal < self.m_UpperBound and \
(self.m_UpperBound - self.m_CurVal) > (self.m_Missing - self.m_MissingDetected)
def ResetToHead(self):
self.m_CurVal = 0
self.m_MissingDetected = 0
def GetMissingNumbers(self):
return self.m_MissingNumbers
def GetUpperBound(self):
return self.m_UpperBound
def GetMissing(self):
return self.m_Missing
# Approach 2 and 3 implementation
from NumberStream import NumberStream
import sortedcontainers
import sys
class MissingNumberDetector(object):
"""description of class"""
def __init__(self):
pass
def __FindMissingNumber(self, numberStream, lowerBound, upperBound, memSize, results):
try:
#linear detect the missing number
hashMap = bytearray([00]*memSize*4)
numberStream.ResetToHead()
while numberStream.HasNext():
temp = numberStream.GetNext()
if temp >= lowerBound and temp <= upperBound:
idx = (temp - lowerBound)/8
bits = (temp - lowerBound) - idx*8
hashMap[idx] = hashMap[idx] | (1 << bits)
index = -1
number = lowerBound
for value in hashMap:
if number > upperBound:
break
index = index + 1
if value == 255:
number = number + 8;
continue;
for bit in range(0, 8):
flag = value & (1 << bit)
# each bit to represent if a number is missing (0 - missing)
if flag == 0:
results.add(lowerBound + index * 8 + bit)
number = number + 1;
if number > upperBound:
break;
except AttributeError as e:
print ("AttributeError: {0}".format(e))
except:
print ("Unexpected Error: ", sys.exc_info()[0])
# Approach 2 recursive implementation
def DetectMissingNumber(self, numberStream, memSize = 1000):
try:
results = set()
if numberStream.GetMissing() == 0:
return results
searchQueue = []
searchQueue.append((1, numberStream.GetUpperBound()))
totalLinearDetectSize = 1000 * 32
while len(searchQueue) != 0:
(lowerBound, upperBound) = searchQueue.pop()
if (upperBound - lowerBound) <= totalLinearDetectSize:
# find missing numbers in small range (within mem size)
self.__FindMissingNumber(numberStream, lowerBound,\
upperBound, memSize, results)
else:
numberStream.ResetToHead();
mid = (lowerBound + upperBound)/2
countOf1stHalf = 0
countOf2ndHalf = 0
while numberStream.HasNext():
temp = numberStream.GetNext()
if temp <= mid and temp >= lowerBound:
countOf1stHalf = countOf1stHalf + 1
elif temp > mid and temp <= upperBound:
countOf2ndHalf = countOf2ndHalf + 1
if countOf1stHalf < (mid + 1 - lowerBound):
# push [lowerBound, mid]
searchQueue.append((lowerBound, mid))
if countOf2ndHalf < (upperBound - mid):
# push [mid + 1, upperBound]
searchQueue.append((mid + 1, upperBound))
return results
except AttributeError as e:
print ("AttributeError: {0}".format(e))
except:
print ("Unexpected Error: ", sys.exc_info()[0])
# Approach 2 recursive internal implementation
def __DetectMissingNumber_R(self, numberStream, lowerBound,\
upperBound, memSize, results):
try:
if (upperBound - lowerBound) <= memSize*32:
self.__FindMissingNumber(numberStream, lowerBound,\
upperBound, memSize, results)
return
numberStream.ResetToHead();
mid = (lowerBound + upperBound)/2
countOf1stHalf = 0
countOf2ndHalf = 0
while numberStream.HasNext():
temp = numberStream.GetNext()
if temp <= mid and temp >= lowerBound:
countOf1stHalf = countOf1stHalf + 1
elif temp > mid and temp <= upperBound:
countOf2ndHalf = countOf2ndHalf + 1
if countOf1stHalf < (mid + 1 - lowerBound):
# sub problem [lowerBound, mid]
self.__DetectMissingNumber_R(numberStream, lowerBound,\
mid, memSize, results)
if countOf2ndHalf < (upperBound - mid):
# sub problem [mid + 1, upperBound]
self.__DetectMissingNumber_R(numberStream, mid + 1,
upperBound, memSize, results)
except AttributeError as e:
print ("AttributeError: {0}".format(e))
except:
print ("Unexpected Error: ", sys.exc_info()[0])
# Approach 2 recursive implementation
def DetectMissingNumber_R(self, numberStream, memSize = 1000):
try:
results = set()
if numberStream.GetMissing() > 0:
self.__DetectMissingNumber_R(numberStream, 1, \
numberStream.GetUpperBound(), memSize, results)
return results
except AttributeError as e:
print ("AttributeError: {0}".format(e))
except:
print ("Unexpected Error: ", sys.exc_info()[0])
# Approach 3 implementation - divide ranges
def __DivideRange(self, numberStream, lowerBound, upperBound, memSize, ranges):
try:
division = (upperBound + 1 - lowerBound) / (memSize << 5)
ladders = sortedcontainers.SortedDict()
ladders[lowerBound] = 0
for idx in range(1, division):
ladders[lowerBound + idx * (memSize << 5)] = 0
if division * (memSize << 5) < upperBound:
ladders[division * (memSize << 5) + lowerBound] = 0
numberStream.ResetToHead()
keys = ladders.keys();
while numberStream.HasNext():
temp = numberStream.GetNext()
if temp >= lowerBound and temp <= upperBound:
lBound = ladders.bisect_left(temp)
uBound = ladders.bisect_right(temp)
if lBound != uBound:
key = keys[lBound];
else:
key = keys[uBound - 1]
ladders[key] = ladders[key] + 1
for idx in range(0, len(keys) - 1) :
if (keys[idx + 1] - keys[idx]) > ladders[keys[idx]]:
ranges.append((keys[idx], keys[idx + 1] - 1))
if (upperBound - keys[len(keys) -1]) > ladders[keys[len(keys) - 1]]:
ranges.append((keys[len(keys) - 1], upperBound))
except AttributeError as e:
print ("AttributeError: {0}".format(e))
except IndexError as e:
print ("IndexError: {0}".format(e))
except:
print ("Unexpected Error: ", sys.exc_info()[0])
# Approach 3 implementation
def DetectMissingNumber_Div(self, numberStream, memSize):
try:
results = set()
if numberStream.GetMissing() == 0:
return results
searchQueue = []
searchQueue.append((1, numberStream.GetUpperBound()))
totalLinearDetectSize = memSize * 32
while len(searchQueue) != 0:
(lowerBound, upperBound) = searchQueue.pop()
if (upperBound - lowerBound) <= totalLinearDetectSize:
# find missing numbers in small range (within mem size)
self.__FindMissingNumber(numberStream, lowerBound,\
upperBound, memSize, results)
else:
# divide ranges
self.__DivideRange(numberStream, lowerBound,\
upperBound, memSize, searchQueue)
return results
except AttributeError as e:
print ("AttributeError: {0}".format(e))
except:
print ("Unexpected Error: ", sys.exc_info()[0])
# ********************************************************************************
# Unit Test
# ********************************************************************************
import unittest
from MissingNumberDetector import MissingNumberDetector
from NumberStream import NumberStream
from NumberGenerator import NumberGenerator
class Test_MissingNumberDetector(unittest.TestCase):
def test_MND(self):
ns = NumberGenerator(1000000, 100)
mbd = MissingNumberDetector()
results = mbd.DetectMissingNumber(ns, 1000)
self.assertEqual(results, ns.GetMissingNumbers())
results = mbd.DetectMissingNumber(ns, 10000)
self.assertEqual(results, ns.GetMissingNumbers())
results = mbd.DetectMissingNumber(ns, 100000)
self.assertEqual(results, ns.GetMissingNumbers())
def test_MND_R(self):
ns = NumberGenerator(1000000, 100)
mbd = MissingNumberDetector()
results = mbd.DetectMissingNumber_R(ns, 1000)
self.assertEqual(results, ns.GetMissingNumbers())
results = mbd.DetectMissingNumber_R(ns, 10000)
self.assertEqual(results, ns.GetMissingNumbers())
results = mbd.DetectMissingNumber_R(ns, 100000)
self.assertEqual(results, ns.GetMissingNumbers())
def test_MND_Div(self):
ns = NumberGenerator(1000000, 100)
mbd = MissingNumberDetector()
results = mbd.DetectMissingNumber_Div(ns, 1000)
self.assertEqual(results, ns.GetMissingNumbers())
results = mbd.DetectMissingNumber_Div(ns, 10000)
self.assertEqual(results, ns.GetMissingNumbers())
results = mbd.DetectMissingNumber_Div(ns, 100000)
self.assertEqual(results, ns.GetMissingNumbers())
if __name__ == '__main__':
unittest.main()