Saturday, November 12, 2016

Data Structure and Algorithm - Detect Missing Numbers in a Large Set

1. Problem Description
This is an Amazon interview question for senior software development engineer from careercup. Here is the original thread,
"
Given an array of 1 billion numbers with just a few 100 missing, find all the missing numbers. you have only enough memory to store only 10k elements
PS October 07, 2016 in United States Report Duplicate | Flag ".


2. Data Structure and Algorithm
3 approaches are proposed in my C++ post, Data Structure and Algorithm - Detect Missing Numbers in a Large Set. The C++ implementation is there too.

3. Python Implementation
An abstract base class called "NumberStream" is implemented to act as the 1 billion numbers. A derived class called "NumberGenerator" is implemented to return the numbers with range of 1 billion with missing numbers.

A loop and recursive solution are implemented for Approach 2. A loop implementation is implemented for Approach 3. The test is based on a smaller scale, with 1 million number with 100 missing and 10000/100000/1000000 memory available

# *******************************************************************************
# Implementation
# *******************************************************************************
from abc import ABCMeta, abstractmethod

class NumberStream:
    """description of class"""
    __metadata__ = ABCMeta

    @abstractmethod
    def HasNext(self):
        pass

    @abstractmethod
    def GetNext(self):
        pass

    @abstractmethod
    def ResetToHead(self):
        pass

    @abstractmethod
    def GetMissingNumbers(self):
        pass  

    @abstractmethod
    def GetUpperBound(self):
        pass

    @abstractmethod
    def GetMissing(self):
        pass
.
from NumberStream import NumberStream
import random

class NumberGenerator(NumberStream):
    """description of class"""

    def __init__(self, upperBound, missing):
        #super(self, upperBound, missing)
        self.m_UpperBound = upperBound
        self.m_Missing = missing
        self.m_MissingNumbers = set()
        self.m_CurVal = 0;
        self.m_MissingDetected = 0;
        random.seed()
        count = 0
        # generate random numbers as the missing numbers
        while count < missing:
            temp = random.randint(1, upperBound)
            if temp not in self.m_MissingNumbers:
                self.m_MissingNumbers.add(temp)
                count = count + 1

    def GetNext(self):
        self.m_CurVal = self.m_CurVal + 1
        while self.m_CurVal in self.m_MissingNumbers:
            self.m_CurVal = self.m_CurVal + 1
            self.m_MissingDetected = self.m_MissingDetected + 1
        return self.m_CurVal

    def HasNext(self):
        return self.m_CurVal < self.m_UpperBound and \
            (self.m_UpperBound - self.m_CurVal) > (self.m_Missing - self.m_MissingDetected)
 
    def ResetToHead(self):
        self.m_CurVal = 0
        self.m_MissingDetected = 0  

    def GetMissingNumbers(self):
        return self.m_MissingNumbers

    def GetUpperBound(self):
        return self.m_UpperBound

    def GetMissing(self):
        return self.m_Missing

# Approach 2 and 3 implementation
from NumberStream import NumberStream

import sortedcontainers
import sys

class MissingNumberDetector(object):
    """description of class"""

    def __init__(self):
        pass

    def __FindMissingNumber(self, numberStream, lowerBound, upperBound, memSize, results):
        try:
            #linear detect the missing number
            hashMap = bytearray([00]*memSize*4)
            numberStream.ResetToHead()
            while numberStream.HasNext():
                temp = numberStream.GetNext()
                if temp >= lowerBound and temp <= upperBound:
                    idx = (temp - lowerBound)/8
                    bits = (temp - lowerBound) - idx*8
                    hashMap[idx] = hashMap[idx] | (1 << bits)
            index = -1
            number = lowerBound
            for value in hashMap:
                if number > upperBound:
                    break
                index = index + 1
                if value == 255:
                    number = number + 8;
                    continue;
                for bit in range(0, 8):
                    flag = value & (1 << bit)
                    # each bit to represent if a number is missing (0 - missing)
                    if flag == 0:
                        results.add(lowerBound + index * 8 + bit)
                    number = number + 1;
                    if number > upperBound:
                        break;
        except AttributeError as e:
            print ("AttributeError: {0}".format(e))
        except:
            print ("Unexpected Error: ", sys.exc_info()[0])

    # Approach 2 recursive implementation
    def DetectMissingNumber(self, numberStream, memSize = 1000):
        try:
            results = set()
            if numberStream.GetMissing() == 0:
                return results
            searchQueue = []
            searchQueue.append((1, numberStream.GetUpperBound()))
            totalLinearDetectSize = 1000 * 32
            while len(searchQueue) != 0:
                (lowerBound, upperBound) = searchQueue.pop()
                if (upperBound - lowerBound) <= totalLinearDetectSize:
                    # find missing numbers in small range (within mem size)
                    self.__FindMissingNumber(numberStream, lowerBound,\
                                    upperBound, memSize, results)
                else:
                    numberStream.ResetToHead();
                    mid = (lowerBound + upperBound)/2
                    countOf1stHalf = 0
                    countOf2ndHalf = 0
                    while numberStream.HasNext():
                        temp = numberStream.GetNext()
                        if temp <= mid and temp >= lowerBound:
                            countOf1stHalf = countOf1stHalf + 1
                        elif temp > mid and temp <= upperBound:
                            countOf2ndHalf = countOf2ndHalf + 1
                    if countOf1stHalf < (mid + 1 - lowerBound):
                        # push [lowerBound, mid]
                        searchQueue.append((lowerBound, mid))
                    if countOf2ndHalf < (upperBound - mid):
                        # push [mid + 1, upperBound]
                        searchQueue.append((mid + 1, upperBound))                      
            return results                  
        except AttributeError as e:
            print ("AttributeError: {0}".format(e))
        except:
            print ("Unexpected Error: ", sys.exc_info()[0])
 
    # Approach 2 recursive internal implementation 
    def __DetectMissingNumber_R(self, numberStream, lowerBound,\
                                                        upperBound, memSize, results):
        try:
            if (upperBound - lowerBound) <= memSize*32:
                self.__FindMissingNumber(numberStream, lowerBound,\
                                upperBound, memSize, results)
                return

            numberStream.ResetToHead();
            mid = (lowerBound + upperBound)/2
            countOf1stHalf = 0
            countOf2ndHalf = 0
            while numberStream.HasNext():
                temp = numberStream.GetNext()
                if temp <= mid and temp >= lowerBound:
                    countOf1stHalf = countOf1stHalf + 1
                elif temp > mid and temp <= upperBound:
                    countOf2ndHalf = countOf2ndHalf + 1
            if countOf1stHalf < (mid + 1 - lowerBound):
                # sub problem [lowerBound, mid]
                self.__DetectMissingNumber_R(numberStream, lowerBound,\
                                mid, memSize, results)
            if countOf2ndHalf < (upperBound - mid):
                # sub problem [mid + 1, upperBound]
                self.__DetectMissingNumber_R(numberStream, mid + 1,
                                upperBound, memSize, results)          
        except AttributeError as e:
            print ("AttributeError: {0}".format(e))
        except:
            print ("Unexpected Error: ", sys.exc_info()[0])  

    # Approach 2 recursive implementation
    def DetectMissingNumber_R(self, numberStream, memSize = 1000):
        try:
            results = set()
            if numberStream.GetMissing() > 0:
                self.__DetectMissingNumber_R(numberStream, 1, \
                       numberStream.GetUpperBound(), memSize, results)
            return results
        except AttributeError as e:
            print ("AttributeError: {0}".format(e))
        except:              
            print ("Unexpected Error: ", sys.exc_info()[0])

    # Approach 3 implementation - divide ranges
    def __DivideRange(self, numberStream, lowerBound, upperBound, memSize, ranges):
        try:
            division = (upperBound + 1 - lowerBound) / (memSize << 5)
            ladders = sortedcontainers.SortedDict()
            ladders[lowerBound] = 0
            for idx in range(1, division):
                ladders[lowerBound + idx * (memSize << 5)] = 0
            if division * (memSize << 5) < upperBound:
                ladders[division * (memSize << 5) + lowerBound] = 0
         
            numberStream.ResetToHead()
            keys = ladders.keys();
            while numberStream.HasNext():
                temp = numberStream.GetNext()
                if temp >= lowerBound and temp <= upperBound:
                    lBound = ladders.bisect_left(temp)
                    uBound = ladders.bisect_right(temp)
                    if lBound != uBound:
                        key = keys[lBound];
                    else:
                        key = keys[uBound - 1]
                    ladders[key] = ladders[key] + 1

            for idx in range(0, len(keys) - 1) :
                if (keys[idx + 1] - keys[idx]) > ladders[keys[idx]]:
                    ranges.append((keys[idx], keys[idx + 1] - 1))
            if (upperBound - keys[len(keys) -1]) > ladders[keys[len(keys) - 1]]:
                ranges.append((keys[len(keys) - 1], upperBound))
        except AttributeError as e:
            print ("AttributeError: {0}".format(e))
        except IndexError as e:
            print ("IndexError: {0}".format(e))
        except:
            print ("Unexpected Error: ", sys.exc_info()[0])
 
    # Approach 3 implementation
    def DetectMissingNumber_Div(self, numberStream, memSize):
        try:
            results = set()
            if numberStream.GetMissing() == 0:
                return results
            searchQueue = []
            searchQueue.append((1, numberStream.GetUpperBound()))
            totalLinearDetectSize = memSize * 32
            while len(searchQueue) != 0:
                (lowerBound, upperBound) = searchQueue.pop()
                if (upperBound - lowerBound) <= totalLinearDetectSize:
                    # find missing numbers in small range (within mem size)
                    self.__FindMissingNumber(numberStream, lowerBound,\
                                    upperBound, memSize, results)
                else:
                    # divide ranges
                    self.__DivideRange(numberStream, lowerBound,\
                                    upperBound, memSize, searchQueue)
            return results                  
        except AttributeError as e:
            print ("AttributeError: {0}".format(e))
        except:
            print ("Unexpected Error: ", sys.exc_info()[0])

# ********************************************************************************
# Unit Test
# ********************************************************************************
import unittest
from MissingNumberDetector import MissingNumberDetector
from NumberStream import NumberStream
from NumberGenerator import NumberGenerator

class Test_MissingNumberDetector(unittest.TestCase):
    def test_MND(self):
        ns = NumberGenerator(1000000, 100)
        mbd = MissingNumberDetector()  
     
        results = mbd.DetectMissingNumber(ns, 1000)
        self.assertEqual(results, ns.GetMissingNumbers())

        results = mbd.DetectMissingNumber(ns, 10000)
        self.assertEqual(results, ns.GetMissingNumbers())

        results = mbd.DetectMissingNumber(ns, 100000)
        self.assertEqual(results, ns.GetMissingNumbers())

    def test_MND_R(self):
        ns = NumberGenerator(1000000, 100)
        mbd = MissingNumberDetector()  

        results = mbd.DetectMissingNumber_R(ns, 1000)
        self.assertEqual(results, ns.GetMissingNumbers())

        results = mbd.DetectMissingNumber_R(ns, 10000)
        self.assertEqual(results, ns.GetMissingNumbers())

        results = mbd.DetectMissingNumber_R(ns, 100000)
        self.assertEqual(results, ns.GetMissingNumbers())

    def test_MND_Div(self):
        ns = NumberGenerator(1000000, 100)
        mbd = MissingNumberDetector()  

        results = mbd.DetectMissingNumber_Div(ns, 1000)
        self.assertEqual(results, ns.GetMissingNumbers())

        results = mbd.DetectMissingNumber_Div(ns, 10000)
        self.assertEqual(results, ns.GetMissingNumbers())

        results = mbd.DetectMissingNumber_Div(ns, 100000)
        self.assertEqual(results, ns.GetMissingNumbers())

if __name__ == '__main__':
    unittest.main()

No comments:

Post a Comment